diff --git a/Makefile b/Makefile
index 6662b44f8..c98adc179 100644
--- a/Makefile
+++ b/Makefile
@@ -94,9 +94,24 @@ OBJ := $(filter-out $(BUILD_DIR)/nvmon.o,$(OBJ))
 OBJ := $(filter-out $(BUILD_DIR)/topology_gpu.o,$(OBJ))
 OBJ := $(filter-out $(BUILD_DIR)/libnvctr.o,$(OBJ))
 endif
+ifeq ($(COMPILER),GCCPOWER)
+OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_msr.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_pci.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/loadData.o,$(OBJ))
+endif
+ifeq ($(COMPILER),XLC)
+OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_msr.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_pci.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/loadData.o,$(OBJ))
+endif
 PERFMONHEADERS  = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildcard $(SRC_DIR)/includes/*.txt))
 OBJ_LUA    =  $(wildcard ./ext/lua/$(COMPILER)/*.o)
 OBJ_HWLOC  =  $(wildcard ./ext/hwloc/$(COMPILER)/*.o)
+OBJ_GOTCHA = $(wildcard ./ext/GOTCHA/$(COMPILER)/*.o)
 FILTERS := $(filter-out ./filters/README,$(wildcard ./filters/*))
 
 
@@ -118,12 +133,12 @@ endif
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
 
 ifeq ($(BUILDDAEMON),false)
-all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(FREQ_TARGET) $(BENCH_TARGET)
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
 else
 ifeq ($(BUILDFREQ),false)
-all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(BENCH_TARGET)
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
 else
-all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET)
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
 endif
 endif
 
@@ -185,10 +200,14 @@ $(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
 	@echo "===>  BUILD access daemon likwid-accessD"
 	$(Q)$(MAKE) -C  $(SRC_DIR)/access-daemon likwid-accessD
 
-$(FREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
+$(FREQ_TARGET): $(SRC_DIR)/access-daemon/setFreqDaemon.c
 	@echo "===>  BUILD frequency daemon likwid-setFreq"
 	$(Q)$(MAKE) -C  $(SRC_DIR)/access-daemon likwid-setFreq
 
+$(APPDAEMON_TARGET): $(SRC_DIR)/access-daemon/appDaemon.c $(TARGET_GOTCHA_LIB)
+	@echo "===>  BUILD application interface likwid-appDaemon.so"
+	$(Q)$(MAKE) -C  $(SRC_DIR)/access-daemon likwid-appDaemon.so
+
 $(BUILD_DIR):
 	@mkdir $(BUILD_DIR)
 
@@ -215,6 +234,11 @@ $(TARGET_LUA_LIB):
 	@echo "===>  EXTERNAL LUA"
 endif
 
+$(TARGET_GOTCHA_LIB):
+	@echo "===>  ENTER  $(GOTCHA_FOLDER)"
+	$(Q)$(MAKE) --no-print-directory -C $(GOTCHA_FOLDER) $(MAKECMDGOALS)
+
+
 $(TARGET_HWLOC_LIB):
 	@echo "===>  ENTER  $(HWLOC_FOLDER)"
 	$(Q)$(MAKE) --no-print-directory -C $(HWLOC_FOLDER) $(MAKECMDGOALS)
@@ -248,13 +272,13 @@ ifeq ($(findstring $(MAKECMDGOALS),clean),)
 -include $(OBJ:.o=.d)
 endif
 
-.PHONY: clean distclean install uninstall help $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
+.PHONY: clean distclean install uninstall help $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET)
 
 .PRECIOUS: $(BUILD_DIR)/%.pas
 
 .NOTPARALLEL:
 
-clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
+clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET)
 	@echo "===>  CLEAN"
 	@for APP in $(L_APPS); do \
 		rm -f $$APP; \
@@ -264,10 +288,10 @@ clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
 	@rm -f $(DYNAMIC_TARGET_LIB)*
 	@rm -f $(PINLIB)*
 	@rm -f $(FORTRAN_IF_NAME)
-	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
 	@rm -f likwid-config.cmake
 
-distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
+distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET)
 	@echo "===>  DIST CLEAN"
 	@for APP in $(L_APPS); do \
 		rm -f $$APP; \
@@ -277,10 +301,11 @@ distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(BENCH_TARGET)
 	@rm -f $(DYNAMIC_TARGET_LIB)*
 	@rm -f $(PINLIB)*
 	@rm -f $(FORTRAN_IF_NAME)
-	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
 	@rm -rf $(BUILD_DIR)
 	@rm -rf $(TARGET_LUA_LIB).* $(shell basename $(TARGET_LUA_LIB)).*
 	@rm -rf $(TARGET_HWLOC_LIB).* $(shell basename $(TARGET_HWLOC_LIB)).*
+	@rm -rf $(TARGET_GOTCHA_LIB).* $(shell basename $(TARGET_GOTCHA_LIB)).*
 	@rm -f $(GENGROUPLOCK)
 	@rm -f likwid-config.cmake
 	@rm -rf doc/html
@@ -360,7 +385,33 @@ uninstall_freq_moved:
 	@echo "===> No UNINSTALL of setFrequencies tool"
 endif
 
-install: install_daemon install_freq
+ifeq ($(BUILDAPPDAEMON),true)
+install_appdaemon:
+	@echo "===> INSTALL application interface appDaemon to $(PREFIX)/lib/$(APPDAEMON_TARGET)"
+	@mkdir -p $(PREFIX)/lib
+	@install -m 755 $(INSTALL_CHOWN) $(APPDAEMON_TARGET) $(PREFIX)/lib/$(APPDAEMON_TARGET)
+move_appdaemon:
+	@echo "===> MOVE application interface appDaemon from $(PREFIX)/lib/$(APPDAEMON_TARGET) to $(INSTALLED_PREFIX)/lib/$(APPDAEMON_TARGET)"
+	@mkdir -p $(INSTALLED_PREFIX)/lib
+	@install -m 755 $(INSTALL_CHOWN) $(PREFIX)/lib/$(APPDAEMON_TARGET) $(INSTALLED_PREFIX)/lib/$(APPDAEMON_TARGET)
+uninstall_appdaemon:
+	@echo "===> REMOVING application interface appDaemon from $(PREFIX)/lib/$(APPDAEMON_TARGET)"
+	@rm -f $(PREFIX)/lib/$(APPDAEMON_TARGET)
+uninstall_appdaemon_moved:
+	@echo "===> REMOVING application interface appDaemon from $(INSTALLED_PREFIX)/lib/$(APPDAEMON_TARGET)"
+	@rm -f $(INSTALLED_PREFIX)/lib/$(APPDAEMON_TARGET)
+else
+install_appdaemon:
+	@echo "===> No INSTALL of the application interface appDaemon"
+move_appdaemon:
+	@echo "===> No MOVE of the application interface appDaemon"
+uninstall_appdaemon:
+	@echo "===> No UNINSTALL of the application interface appDaemon"
+uninstall_appdaemon_moved:
+	@echo "===> No UNINSTALL of the application interface appDaemon"
+endif
+
+install: install_daemon install_freq install_appdaemon
 	@echo "===> INSTALL applications to $(BINPREFIX)"
 	@mkdir -p $(BINPREFIX)
 	@chmod 755 $(BINPREFIX)
@@ -388,6 +439,7 @@ install: install_daemon install_freq
 	@if [ "$(LUA_INTERNAL)" = "true" ]; then \
 		install -m 755 $(TARGET_LUA_LIB) $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE); \
 	fi
+	@install -m 755 $(GOTCHA_FOLDER)/$(TARGET_GOTCHA_LIB) $(LIBPREFIX)/$(TARGET_GOTCHA_LIB).$(VERSION).$(RELEASE)
 	@cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
 	@cd $(LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
 	@cd $(LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
@@ -398,6 +450,8 @@ install: install_daemon install_freq
 		cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)); \
 		cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION); \
 	fi
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_GOTCHA_LIB))
+	@cd $(LIBPREFIX) && ln -fs $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION)
 	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
 	@mkdir -p $(MANPREFIX)/man1
 	@chmod 755 $(MANPREFIX)/man1
@@ -421,6 +475,7 @@ install: install_daemon install_freq
 	@mkdir -p $(PREFIX)/include
 	@chmod 755 $(PREFIX)/include
 	@install -m 644 src/includes/likwid.h  $(PREFIX)/include/
+	@install -m 644 src/includes/likwid-marker.h  $(PREFIX)/include/
 	@install -m 644 src/includes/bstrlib.h  $(PREFIX)/include/
 	$(FORTRAN_INSTALL)
 	@echo "===> INSTALL groups to $(PREFIX)/share/likwid/perfgroups"
@@ -445,7 +500,7 @@ install: install_daemon install_freq
 	done
 	@install -m 644 likwid-config.cmake $(LIBPREFIX)
 
-move: move_daemon move_freq
+move: move_daemon move_freq move_appdaemon
 	@echo "===> MOVE applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
 	@mkdir -p $(INSTALLED_BINPREFIX)
 	@chmod 755 $(INSTALLED_BINPREFIX)
@@ -469,6 +524,7 @@ move: move_daemon move_freq
 	@install -m 755 $(LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(PINLIB).$(VERSION).$(RELEASE)
 	@install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE)
 	@install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE)
+	@install -m 755 $(LIBPREFIX)/$(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE) $(INSTALLED_LIBPREFIX)/$(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE)
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB)
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(TARGET_LIB).$(VERSION).$(RELEASE) $(TARGET_LIB).$(VERSION)
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(PINLIB).$(VERSION).$(RELEASE) $(PINLIB)
@@ -477,6 +533,8 @@ move: move_daemon move_freq
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_HWLOC_LIB)).$(VERSION)
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB))
 	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_LUA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_LUA_LIB)).$(VERSION)
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_GOTCHA_LIB))
+	@cd $(INSTALLED_LIBPREFIX) && ln -fs $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION).$(RELEASE) $(shell basename $(TARGET_GOTCHA_LIB)).$(VERSION)
 	@echo "===> MOVE man pages from $(MANPREFIX)/man1 to $(INSTALLED_MANPREFIX)/man1"
 	@mkdir -p $(INSTALLED_MANPREFIX)/man1
 	@chmod 755 $(INSTALLED_MANPREFIX)/man1
@@ -485,6 +543,7 @@ move: move_daemon move_freq
 	@mkdir -p $(INSTALLED_PREFIX)/include
 	@chmod 755 $(INSTALLED_PREFIX)/include
 	@install -m 644 $(PREFIX)/include/likwid.h $(INSTALLED_PREFIX)/include/likwid.h
+	@install -m 644 $(PREFIX)/include/likwid-marker.h $(INSTALLED_PREFIX)/include/likwid-marker.h
 	@install -m 644 $(PREFIX)/include/bstrlib.h $(INSTALLED_PREFIX)/include/bstrlib.h
 	@if [ -e $(PREFIX)/include/likwid.mod ]; then install $(PREFIX)/include/likwid.mod $(INSTALLED_PREFIX)/include/likwid.mod; fi
 	@echo "===> MOVE groups from $(PREFIX)/share/likwid/perfgroups to $(INSTALLED_PREFIX)/share/likwid/perfgroups"
@@ -507,7 +566,7 @@ move: move_daemon move_freq
 	@chmod 755 $(LIKWIDFILTERPATH)/*
 	@install -m 644 $(LIBPREFIX)/likwid-config.cmake $(INSTALLED_LIBPREFIX)
 
-uninstall: uninstall_daemon uninstall_freq
+uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon
 	@echo "===> REMOVING applications from $(PREFIX)/bin"
 	@rm -f $(addprefix $(BINPREFIX)/,$(addsuffix  .lua,$(L_APPS)))
 	@for APP in $(L_APPS); do \
@@ -523,6 +582,7 @@ uninstall: uninstall_daemon uninstall_freq
 	@rm -rf  $(PREFIX)/share/lua/likwid.lua
 	@echo "===> REMOVING libs from $(LIBPREFIX)"
 	@rm -f $(LIBPREFIX)/liblikwid*
+	@rm -f $(LIBPREFIX)/$(TARGET_GOTCHA_LIB)
 	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
 	@rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix  .1,$(L_APPS)))
 	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
@@ -532,6 +592,7 @@ uninstall: uninstall_daemon uninstall_freq
 	@rm -f $(MANPREFIX)/man1/likwid-bench.1
 	@echo "===> REMOVING header from $(PREFIX)/include"
 	@rm -f $(PREFIX)/include/likwid.h
+	@rm -f $(PREFIX)/include/likwid-marker.h
 	@rm -f $(PREFIX)/include/bstrlib.h
 	$(FORTRAN_REMOVE)
 	@echo "===> REMOVING filter, groups and default configs from $(PREFIX)/share/likwid"
@@ -542,7 +603,7 @@ uninstall: uninstall_daemon uninstall_freq
 	@rm -rf $(PREFIX)/share/likwid
 	@rm -rf $(LIBPREFIX)/likwid-config.cmake
 
-uninstall_moved: uninstall_daemon_moved uninstall_freq_moved
+uninstall_moved: uninstall_daemon_moved uninstall_freq_moved uninstall_appdaemon_moved
 	@echo "===> REMOVING applications from $(INSTALLED_PREFIX)/bin"
 	@rm -f $(addprefix $(INSTALLED_BINPREFIX)/,$(addsuffix  .lua,$(L_APPS)))
 	@for APP in $(L_APPS); do \
@@ -558,6 +619,7 @@ uninstall_moved: uninstall_daemon_moved uninstall_freq_moved
 	@rm -rf  $(INSTALLED_PREFIX)/share/lua/likwid.lua
 	@echo "===> REMOVING libs from $(INSTALLED_LIBPREFIX)"
 	@rm -f $(INSTALLED_LIBPREFIX)/liblikwid*
+	@rm -f $(INSTALLED_LIBPREFIX)/$(TARGET_GOTCHA_LIB)
 	@echo "===> REMOVING man pages from $(INSTALLED_MANPREFIX)/man1"
 	@rm -f $(addprefix $(INSTALLED_MANPREFIX)/man1/,$(addsuffix  .1,$(L_APPS)))
 	@rm -f $(INSTALLED_MANPREFIX)/man1/feedGnuplot.1
@@ -567,6 +629,7 @@ uninstall_moved: uninstall_daemon_moved uninstall_freq_moved
 	@rm -f $(INSTALLED_MANPREFIX)/man1/likwid-bench.1
 	@echo "===> REMOVING header from $(INSTALLED_PREFIX)/include"
 	@rm -f $(INSTALLED_PREFIX)/include/likwid.h
+	@rm -f $(INSTALLED_PREFIX)/include/likwid-marker.h
 	@rm -f $(INSTALLED_PREFIX)/include/bstrlib.h
 	$(FORTRAN_REMOVE)
 	@echo "===> REMOVING filter, groups and default configs from $(INSTALLED_PREFIX)/share/likwid"
@@ -591,6 +654,8 @@ local: $(L_APPS) likwid.lua
 	@ln -sf $(HWLOC_FOLDER)/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION).$(RELEASE)
 	@ln -sf $(LUA_FOLDER)/liblikwid-lua.so liblikwid-lua.so.$(VERSION)
 	@ln -sf $(LUA_FOLDER)/liblikwid-lua.so liblikwid-lua.so.$(VERSION).$(RELEASE)
+	@ln -sf $(GOTCHA_FOLDER)/liblikwid-gotcha.so liblikwid-gotcha.so.$(VERSION)
+	@ln -sf $(GOTCHA_FOLDER)/liblikwid-gotcha.so liblikwid-gotcha.so.$(VERSION).$(RELEASE)
 	@if [ -e $(LUA_FOLDER)/liblikwid-lua.so ]; then ln -sf $(LUA_FOLDER)/liblikwid-lua.so liblikwid-lua.so.$(VERSION).$(RELEASE); fi
 	@if [ -e $(HWLOC_FOLDER)/liblikwid-hwloc.so ]; then ln -sf $(HWLOC_FOLDER)/liblikwid-hwloc.so liblikwid-hwloc.so.$(VERSION).$(RELEASE); fi
 	@if [ -e $(PINLIB) ]; then ln -sf $(PINLIB) $(PINLIB).$(VERSION).$(RELEASE); fi
diff --git a/README.md b/README.md
index f854377ff..196463d91 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,9 @@ Introduction
 --------------------------------------------------------------------------------
 
 Likwid is a simple to install and use toolsuite of command line applications
-for performance oriented programmers. It works for Intel, AMD and ARMv8 processors
-on the Linux operating system.
+for performance oriented programmers. It works for Intel, AMD, ARMv8 and POWER9
+processors on the Linux operating system. ARMv7 and POWER8 are supported as well,
+but we currently have no test machines to verify them properly.
 
 [![Build Status](https://travis-ci.org/RRZE-HPC/likwid.svg?branch=master)](https://travis-ci.org/RRZE-HPC/likwid)
 
@@ -61,9 +62,16 @@ AMD
 - AMD Interlagos
 - AMD Kabini
 - AMD Zen
+- AMD Zen2
 
-ARMv8 (experimental)
-- Tested on Marvell Thunder X2
+ARM (experimental)
+- ARMv7
+- ARMv8
+- Special support for Marvell Thunder X2
+
+POWER (experimental)
+- IBM POWER8
+- IBM POWER9
 
 --------------------------------------------------------------------------------
 Download, Build and Install
@@ -105,7 +113,7 @@ https://github.com/rrze-likwid/likwid/issues
 Extras
 --------------------------------------------------------------------------------
 - If you want to use the Marker API with Java, you can find the Java module here:
-https://github.com/jlewandowski/likwid-java-api
+https://github.com/jacek-lewandowski/likwid-java-api
 - For Python you can find an interface to the LIKWID API here:
 https://github.com/RRZE-HPC/pylikwid or `pip install pylikwid`
 
diff --git a/bench/Makefile b/bench/Makefile
index d2acd57b8..78100eb20 100644
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -63,6 +63,9 @@ endif
 ifeq ($(COMPILER),GCCARMv8)
 BENCH_DIR   = ./armv8
 endif
+ifeq ($(COMPILER),GCCPOWER)
+BENCH_DIR   = ./power
+endif
 
 
 
diff --git a/bench/armv8/daxpy.ptt b/bench/armv8/daxpy.ptt
new file mode 100644
index 000000000..945adbddb
--- /dev/null
+++ b/bench/armv8/daxpy.ptt
@@ -0,0 +1,15 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision linear combination of two vectors, only scalar operations
+LOADS 2
+STORES 1
+INSTR_LOOP 4
+fmov     FPR7, ARG1
+LOOP 1
+ldr      FPR1, [STR0], #8
+fmul     FPR1, FPR1, FPR7
+ldr      FPR2, [STR1]
+fadd     FPR1, FPR1, FPR2
+str      FPR1, [STR1], #8
diff --git a/bench/armv8/triad.ptt b/bench/armv8/triad.ptt
new file mode 100644
index 000000000..94572be3c
--- /dev/null
+++ b/bench/armv8/triad.ptt
@@ -0,0 +1,16 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+DESC Double-precision triad A(i) = B(i) * C(i) + D(i), only scalar operations
+LOADS 3
+STORES 1
+INSTR_LOOP 6
+LOOP 1
+ldr   D1, [STR1], #8
+ldr   D2, [STR2], #8
+ldr   D3, [STR3], #8
+fmul  D1, D1, D2
+fadd  D1, D1, D3
+str   D1, [STR0], #8
+
diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h
index bb1da2360..658693d25 100644
--- a/bench/includes/allocator.h
+++ b/bench/includes/allocator.h
@@ -45,6 +45,7 @@ extern void allocator_allocateVector(void** ptr,
                 int offset,
                 DataType type,
                 int stride,
-                bstring domain);
+                bstring domain,
+                int init_per_thread);
 
 #endif /*ALLOCATOR_H*/
diff --git a/bench/includes/bstrlib_helper.h b/bench/includes/bstrlib_helper.h
new file mode 120000
index 000000000..2536c9b83
--- /dev/null
+++ b/bench/includes/bstrlib_helper.h
@@ -0,0 +1 @@
+../../src/includes/bstrlib_helper.h
\ No newline at end of file
diff --git a/bench/includes/isa_armv7.h b/bench/includes/isa_armv7.h
new file mode 100644
index 000000000..e3538d23b
--- /dev/null
+++ b/bench/includes/isa_armv7.h
@@ -0,0 +1,214 @@
+/*
+ * =======================================================================================
+ *      Filename:  isa_armv7.h
+ *
+ *      Description:  Definitions used for dynamically compile benchmarks for ARMv7 systems
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_BENCH_ISA_ARMV7_H
+#define LIKWID_BENCH_ISA_ARMV7_H
+
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
+#define ARCHNAME "armv7"
+#define WORDLENGTH 4
+
+/* Append the ARMv7 assembly prologue (.cpu/.fpu, sections, symbol declaration, label
+ * and register saves) to 'code'. funcname: symbol to emit, or NULL for "kernelfunction". Returns 0. */
+int header(struct bstrList* code, char* funcname)
+{
+    bstring glline;
+    bstring typeline;
+    bstring label;
+    if (funcname)
+    {
+        glline = bformat(".global %s", funcname);
+        /* "%%" emits a literal '%'; "\%" is no C escape and left "%f" to be parsed as a conversion */
+        typeline = bformat(".type %s, %%function", funcname);
+        label = bformat("%s :", funcname);
+    }
+    else
+    {
+        glline = bformat(".global kernelfunction");
+        typeline = bformat(".type kernelfunction, %%function");
+        label = bformat("kernelfunction :");
+    }
+
+    bstrListAddChar(code, ".cpu    cortex-a15\n.fpu    neon-vfpv4");
+    bstrListAddChar(code, ".data");
+    bstrListAddChar(code, ".text");
+    bstrListAdd(code, glline);
+    bstrListAdd(code, typeline);
+    bstrListAdd(code, label);
+    bstrListAddChar(code, "push     {r4-r7, lr}");
+    bstrListAddChar(code, "add      r7, sp, #12");
+    bstrListAddChar(code, "push     {r8, r10, r11}");
+    bstrListAddChar(code, "vstmdb   sp!, {d8-d15}");
+
+    bstrListAddChar(code, "\n");
+
+    bdestroy(glline);
+    bdestroy(typeline);
+    bdestroy(label);
+    return 0;
+}
+
+/* Append the ARMv7 assembly epilogue (register restores, .size, GNU-stack note) to 'code'.
+ * funcname: symbol name to size, or NULL for "kernelfunction". Returns 0. */
+int footer(struct bstrList* code, char* funcname)
+{
+    bstring line;
+    if (funcname)
+    {
+        line = bformat(".size %s, .-%s", funcname, funcname);
+    }
+    else
+    {
+        line = bformat(".size kernelfunction, .-kernelfunction");
+    }
+    bstrListAddChar(code, "vldmia   sp!, {d8-d15}");
+    bstrListAddChar(code, "pop      {r8, r10, r11}");
+    bstrListAddChar(code, "pop      {r4-r7, pc}");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    bstrListAddChar(code, "#if defined(__linux__) && defined(__ELF__)");
+    bstrListAddChar(code, ".section .note.GNU-stack,\"\",%progbits");
+    bstrListAddChar(code, "#endif");
+    bdestroy(line);
+    return 0; /* the function is declared int; falling off the end left the result undefined */
+}
+
+/* Append the loop entry to 'code': counter reset, alignment and the loop label ('step' is not used here). */
+int loopheader(struct bstrList* code, char* loopname, int step)
+{
+    bstring line;
+    if (loopname)
+    {
+        line = bformat("%s:", loopname);
+    }
+    else
+    {
+        line = bformat("kernelfunctionloop:"); /* default label when no name is given */
+    }
+    bstrListAddChar(code, "mov   GPR4, #0"); /* GPR4 is the counter that loopfooter() increments */
+    bstrListAddChar(code, ".align 2");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(line);
+    return 0;
+}
+
+/* Append the loop tail to 'code': advance the GPR4 counter by 'step', compare it
+ * with GPR1 and branch back to the loop label while it is lower. Returns 0. */
+int loopfooter(struct bstrList* code, char* loopname, int step)
+{
+    bstring line;
+    if (loopname)
+    {
+        line = bformat("blt %sb", loopname); /* presumably a backward numeric local label (e.g. "1b") - confirm */
+    }
+    else
+    {
+        line = bformat("blt kernelfunctionloop"); /* must match the default label emitted by loopheader() */
+    }
+    bstring bstep = bformat("add GPR4, #%d", step);
+    bstrListAdd(code, bstep);
+    bdestroy(bstep);
+    bstrListAddChar(code, "cmp GPR4, GPR1");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+    bdestroy(line);
+    return 0;
+}
+
+
+static RegisterMap Registers[] = { /* presumably placeholder -> ARMv7 register name; an empty pair ends the list */
+    {"GPR1", "r0"},
+    {"GPR2", "r1"},
+    {"GPR3", "r2"},
+    {"GPR4", "r3"},
+    {"GPR5", "r4"},
+    {"GPR6", "r5"},
+    {"GPR7", "r6"},
+    {"GPR8", "r7"},
+    {"GPR9", "r8"},
+    {"GPR10", "r9"},
+    {"GPR11", "r10"},
+    {"GPR12", "r11"},
+    {"GPR13", "r12"},
+    {"GPR14", "r13"}, /* NOTE(review): r13 is the stack pointer on ARM - confirm kernels never use GPR14 as scratch */
+    {"GPR15", "r14"}, /* NOTE(review): r14 is the link register */
+    {"GPR16", "r15"}, /* NOTE(review): r15 is the program counter */
+    {"FPR1", "d0"},
+    {"FPR2", "d1"},
+    {"FPR3", "d2"},
+    {"FPR4", "d3"},
+    {"FPR5", "d4"},
+    {"FPR6", "d5"},
+    {"FPR7", "d6"},
+    {"FPR8", "d7"},
+    {"FPR9", "d8"},
+    {"FPR10", "d9"},
+    {"FPR11", "d10"},
+    {"FPR12", "d11"},
+    {"FPR13", "d12"},
+    {"FPR14", "d13"},
+    {"FPR15", "d14"},
+    {"FPR16", "d15"},
+    {"", ""},
+};
+
+static RegisterMap Arguments[] = { /* presumably ARGn placeholder -> register or stack slot; an empty pair ends the list */
+    {"ARG1", "r0"},
+    {"ARG2", "r1"},
+    {"ARG3", "r2"},
+    {"ARG4", "r3"},
+    {"ARG7", "[SPTR+8]"}, /* NOTE(review): ARG5 and ARG6 have no entry - confirm this gap is intended */
+    {"ARG8", "[SPTR+12]"},
+    {"ARG9", "[SPTR+16]"},
+    {"ARG10", "[SPTR+20]"},
+    {"ARG11", "[SPTR+24]"},
+    {"ARG12", "[SPTR+28]"},
+    {"ARG13", "[SPTR+32]"},
+    {"ARG14", "[SPTR+36]"},
+    {"ARG15", "[SPTR+40]"},
+    {"ARG16", "[SPTR+44]"},
+    {"ARG17", "[SPTR+48]"},
+    {"ARG18", "[SPTR+52]"},
+    {"ARG19", "[SPTR+56]"},
+    {"ARG20", "[SPTR+60]"},
+    {"ARG21", "[SPTR+64]"},
+    {"ARG22", "[SPTR+68]"},
+    {"ARG23", "[SPTR+72]"},
+    {"ARG24", "[SPTR+76]"},
+    {"", ""},
+};
+
+static RegisterMap Sptr = {"SPTR", "sp"}; /* SPTR placeholder used by the stack-slot entries in Arguments */
+static RegisterMap Bptr = {"BPTR", "rbp"}; /* NOTE(review): "rbp" is an x86-64 register name - confirm BPTR is unused on ARMv7 */
+
+#endif
diff --git a/bench/includes/isa_armv8.h b/bench/includes/isa_armv8.h
new file mode 100644
index 000000000..e1e14411e
--- /dev/null
+++ b/bench/includes/isa_armv8.h
@@ -0,0 +1,214 @@
+/*
+ * =======================================================================================
+ *      Filename:  isa_armv8.h
+ *
+ *      Description:  Definitions used for dynamically compile benchmarks for ARMv8 systems
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_BENCH_ISA_ARMV8_H
+#define LIKWID_BENCH_ISA_ARMV8_H
+
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
+#define ARCHNAME "armv8"
+#define WORDLENGTH 4
+
+/* Append the ARMv8 assembly prologue (.cpu, sections, symbol declaration and label) to 'code'.
+ * funcname: symbol to emit, or NULL for "kernelfunction". Returns 0. */
+int header(struct bstrList* code, char* funcname)
+{
+    bstring glline;
+    bstring typeline;
+    bstring label;
+    if (funcname)
+    {
+        glline = bformat(".global %s", funcname);
+        typeline = bformat(".type %s, @function", funcname);
+        label = bformat("%s :", funcname);
+    }
+    else
+    {
+        glline = bformat(".global kernelfunction");
+        typeline = bformat(".type kernelfunction, @function");
+        label = bformat("kernelfunction :");
+    }
+
+    bstrListAddChar(code, ".cpu    generic+fp+simd");
+    bstrListAddChar(code, ".data");
+    bstrListAddChar(code, ".text");
+    bstrListAdd(code, glline);
+    bstrListAdd(code, typeline);
+    bstrListAdd(code, label);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(glline);
+    bdestroy(typeline);
+    bdestroy(label);
+    return 0;
+}
+
+/* Append the ARMv8 assembly epilogue (".exit" label, ret, .size, GNU-stack note) to 'code'.
+ * funcname: symbol name to size, or NULL for "kernelfunction". Returns 0. */
+int footer(struct bstrList* code, char* funcname)
+{
+    bstring line;
+    if (funcname)
+    {
+        line = bformat(".size %s, .-%s", funcname, funcname);
+    }
+    else
+    {
+        line = bformat(".size kernelfunction, .-kernelfunction");
+    }
+    bstrListAddChar(code, ".exit:");
+    bstrListAddChar(code, "ret");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    bstrListAddChar(code, "#if defined(__linux__) && defined(__ELF__)");
+    bstrListAddChar(code, ".section .note.GNU-stack,\"\",%progbits");
+    bstrListAddChar(code, "#endif");
+    bdestroy(line);
+    return 0; /* the function is declared int; falling off the end left the result undefined */
+}
+
+/* Append the loop entry to 'code': reset the GPR6 counter and emit the loop label ('step' is not used here). */
+int loopheader(struct bstrList* code, char* loopname, int step)
+{
+    bstring line;
+    if (loopname)
+    {
+        line = bformat("%s:", loopname);
+    }
+    else
+    {
+        line = bformat("kernelfunctionloop:"); /* default label when no name is given */
+    }
+    bstrListAddChar(code, "mov   GPR6, 0"); /* GPR6 is the counter that loopfooter() increments */
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(line);
+    return 0;
+}
+
+/* Append the loop tail to 'code': advance the GPR6 counter by 'step', compare it
+ * with ARG1 and branch back to the loop label while it is lower. Returns 0. */
+int loopfooter(struct bstrList* code, char* loopname, int step)
+{
+    bstring line;
+    if (loopname)
+    {
+        line = bformat("blt %s", loopname); /* was "tblt", which is not an AArch64 mnemonic */
+    }
+    else
+    {
+        line = bformat("blt kernelfunctionloop"); /* default label emitted by loopheader() */
+    }
+    bstring bstep = bformat("add GPR6, GPR6, #%d", step);
+    bstrListAdd(code, bstep);
+    bdestroy(bstep);
+    bstrListAddChar(code, "cmp   GPR6, ARG1");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+    bdestroy(line);
+    return 0;
+}
+
+
+static RegisterMap Registers[] = { /* presumably placeholder -> ARMv8 register name; an empty pair ends the list */
+    {"GPR1", "x1"}, /* NOTE(review): GPRn maps to x<n>, the same register Arguments assigns to ARG<n+1> (up to x7) */
+    {"GPR2", "x2"},
+    {"GPR3", "x3"},
+    {"GPR4", "x4"},
+    {"GPR5", "x5"},
+    {"GPR6", "x6"}, /* used as the loop counter by loopheader()/loopfooter() */
+    {"GPR7", "x7"},
+    {"GPR8", "x8"},
+    {"GPR9", "x9"},
+    {"GPR10", "x10"},
+    {"GPR11", "x11"},
+    {"GPR12", "x12"},
+    {"GPR13", "x13"},
+    {"GPR14", "x14"},
+    {"GPR15", "x15"},
+    {"GPR16", "x16"},
+    {"GPR17", "x17"},
+    {"GPR18", "x18"},
+    {"GPR19", "x19"},
+    {"GPR20", "x20"},
+    {"GPR21", "x21"},
+    {"GPR22", "x22"},
+    {"FPR1", "d0"},
+    {"FPR2", "d1"},
+    {"FPR3", "d2"},
+    {"FPR4", "d3"},
+    {"FPR5", "d4"},
+    {"FPR6", "d5"},
+    {"FPR7", "d6"},
+    {"FPR8", "d7"},
+    {"FPR9", "d8"},
+    {"FPR10", "d9"},
+    {"FPR11", "d10"},
+    {"FPR12", "d11"},
+    {"FPR13", "d12"},
+    {"FPR14", "d13"},
+    {"FPR15", "d14"},
+    {"FPR16", "d15"},
+    {"", ""},
+};
+
+static RegisterMap Arguments[] = { /* presumably ARGn placeholder -> register or stack slot; an empty pair ends the list */
+    {"ARG1", "x0"},
+    {"ARG2", "x1"},
+    {"ARG3", "x2"},
+    {"ARG4", "x3"},
+    {"ARG5", "x4"},
+    {"ARG6", "x5"},
+    {"ARG7", "x6"},
+    {"ARG8", "x7"},
+    {"ARG9", "[SPTR+32]"}, /* NOTE(review): stack slots start at offset 32 in 8-byte steps - confirm against the caller's frame layout */
+    {"ARG10", "[SPTR+40]"},
+    {"ARG11", "[SPTR+48]"},
+    {"ARG12", "[SPTR+56]"},
+    {"ARG13", "[SPTR+64]"},
+    {"ARG14", "[SPTR+72]"},
+    {"ARG15", "[SPTR+80]"},
+    {"ARG16", "[SPTR+88]"},
+    {"ARG17", "[SPTR+96]"},
+    {"ARG18", "[SPTR+104]"},
+    {"ARG19", "[SPTR+112]"},
+    {"ARG20", "[SPTR+120]"},
+    {"ARG21", "[SPTR+128]"},
+    {"ARG22", "[SPTR+136]"},
+    {"ARG23", "[SPTR+144]"},
+    {"ARG24", "[SPTR+152]"},
+    {"", ""},
+};
+
+static RegisterMap Sptr = {"SPTR", "sp"}; /* SPTR placeholder used by the stack-slot entries in Arguments */
+static RegisterMap Bptr = {"BPTR", "rbp"}; /* NOTE(review): "rbp" is an x86-64 register name - confirm BPTR is unused on ARMv8 */
+
+#endif
diff --git a/bench/includes/isa_x86-64.h b/bench/includes/isa_x86-64.h
new file mode 100644
index 000000000..2138268ca
--- /dev/null
+++ b/bench/includes/isa_x86-64.h
@@ -0,0 +1,226 @@
+/*
+ * =======================================================================================
+ *      Filename:  isa_x86-64.h
+ *
+ *      Description:  Definitions used to dynamically compile benchmarks for x86-64 systems
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_BENCH_ISA_X8664_H
+#define LIKWID_BENCH_ISA_X8664_H
+
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
+#define ARCHNAME "x86-64"
+
+
+/*
+ * Append the x86-64 assembly preamble of a benchmark kernel to code: the
+ * constant data tables, the global function label and the prologue that
+ * saves rbp, rbx and r12-r15 (footer() pops them again).
+ * funcname may be NULL, in which case the symbol is "kernelfunction".
+ * Always returns 0.
+ */
+int header(struct bstrList* code, char* funcname)
+{
+    const char* symbol = funcname ? funcname : "kernelfunction";
+    bstring glline = bformat(".global %s", symbol);
+    bstring typeline = bformat(".type %s, @function", symbol);
+    bstring label = bformat("%s :", symbol);
+
+    /* Data section with the labelled constant tables. */
+    bstrListAddChar(code, ".intel_syntax noprefix");
+    bstrListAddChar(code, ".data");
+    bstrListAddChar(code, ".align 64\nSCALAR:\n.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0");
+    bstrListAddChar(code, ".align 64\nSSCALAR:\n.single 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0");
+    bstrListAddChar(code, ".align 64\nISCALAR:\n.int 1, 1, 1, 1, 1, 1, 1, 1");
+    bstrListAddChar(code, ".align 16\nOMM:\n.int 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15");
+    /* NOTE(review): IOMM advances in steps of 16 but jumps from 96 to 128
+     * (no 112) - confirm that this is intended. */
+    bstrListAddChar(code, ".align 16\nIOMM:\n.int 0,16,32,48,64,80,96,128,144,160,176,192,208,224,240,256");
+    bstrListAddChar(code, ".align 16\nTOMM:\n.int 0,2,4,6,16,18,20,22,32,34,36,38,48,50,52,54");
+    /* Text section: declare the symbol and place its entry label. */
+    bstrListAddChar(code, ".text");
+    bstrListAdd(code, glline);
+    bstrListAdd(code, typeline);
+    bstrListAdd(code, label);
+    /* Prologue: establish the frame pointer, then save rbx and r12-r15. */
+    bstrListAddChar(code, "push rbp");
+    bstrListAddChar(code, "mov rbp, rsp");
+    bstrListAddChar(code, "push rbx");
+    bstrListAddChar(code, "push r12");
+    bstrListAddChar(code, "push r13");
+    bstrListAddChar(code, "push r14");
+    bstrListAddChar(code, "push r15");
+
+    bstrListAddChar(code, "\n");
+
+    bdestroy(glline);
+    bdestroy(typeline);
+    bdestroy(label);
+    return 0;
+}
+
+/*
+ * Append the x86-64 function epilogue to code: pop r15-r12 and rbx, restore
+ * rsp and rbp, return, then emit the .size directive and the GNU-stack note.
+ * funcname may be NULL, in which case "kernelfunction" is used.
+ * Always returns 0.
+ */
+int footer(struct bstrList* code, char* funcname)
+{
+    const char* symbol = funcname ? funcname : "kernelfunction";
+    bstring line = bformat(".size %s, .-%s", symbol, symbol);
+
+    /* Undo the pushes of header() in reverse order. */
+    bstrListAddChar(code, "pop r15");
+    bstrListAddChar(code, "pop r14");
+    bstrListAddChar(code, "pop r13");
+    bstrListAddChar(code, "pop r12");
+    bstrListAddChar(code, "pop rbx");
+    bstrListAddChar(code, "mov  rsp, rbp");
+    bstrListAddChar(code, "pop rbp");
+    bstrListAddChar(code, "ret");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    /* Empty .note.GNU-stack section: the object does not need an executable stack. */
+    bstrListAddChar(code, "#if defined(__linux__) && defined(__ELF__)");
+    bstrListAddChar(code, ".section .note.GNU-stack,\"\",%progbits");
+    bstrListAddChar(code, "#endif");
+    bdestroy(line);
+    return 0;
+}
+
+/*
+ * Open the kernel loop: clear the loop counter placeholder GPR1 and place
+ * the loop label after a 16-byte alignment directive.
+ * loopname may be NULL, in which case the label is "kernelfunctionloop".
+ * step is not used here. Always returns 0.
+ */
+int loopheader(struct bstrList* code, char* loopname, int step)
+{
+    const char* name = loopname ? loopname : "kernelfunctionloop";
+    bstring label = bformat("%s:", name);
+
+    /* The counter starts at zero; loopfooter() increments and tests it. */
+    bstrListAddChar(code, "xor   GPR1, GPR1");
+    bstrListAddChar(code, ".align 16");
+    bstrListAdd(code, label);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(label);
+    return 0;
+}
+
+/*
+ * Close the kernel loop: add step to the counter GPR1, compare it with ARG1
+ * and jump back to the loop label while the counter is smaller.
+ * A caller-supplied loopname is referenced as "<loopname>b" (presumably a
+ * numeric gas local label such as "1" - confirm against the callers). Without
+ * a loopname the jump targets "kernelfunctionloop", the label loopheader() emits.
+ * Always returns 0.
+ */
+int loopfooter(struct bstrList* code, char* loopname, int step)
+{
+    bstring jump = loopname ? bformat("jl %sb", loopname)
+                            : bformat("jl kernelfunctionloop");
+    bstring bstep = bformat("add GPR1, %d", step);
+    bstrListAdd(code, bstep);
+    bstrListAddChar(code, "cmp   GPR1, ARG1");
+    bstrListAdd(code, jump);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(bstep);
+    bdestroy(jump);
+    return 0;
+}
+
+/* Placeholder names paired with x86-64 register names: GPR1-GPR14 are 64-bit general purpose registers, FPR1-FPR16 are xmm0-xmm15; {"", ""} ends the table. */
+static RegisterMap Registers[] = {
+    {"GPR1", "rax"},
+    {"GPR2", "rbx"},
+    {"GPR3", "rcx"},
+    {"GPR4", "rdx"},
+    {"GPR5", "rsi"},
+    {"GPR6", "rdi"},
+    {"GPR7", "r8"},
+    {"GPR8", "r9"},
+    {"GPR9", "r10"},
+    {"GPR10", "r11"},
+    {"GPR11", "r12"},
+    {"GPR12", "r13"},
+    {"GPR13", "r14"},
+    {"GPR14", "r15"},
+    {"FPR1", "xmm0"},
+    {"FPR2", "xmm1"},
+    {"FPR3", "xmm2"},
+    {"FPR4", "xmm3"},
+    {"FPR5", "xmm4"},
+    {"FPR6", "xmm5"},
+    {"FPR7", "xmm6"},
+    {"FPR8", "xmm7"},
+    {"FPR9", "xmm8"},
+    {"FPR10", "xmm9"},
+    {"FPR11", "xmm10"},
+    {"FPR12", "xmm11"},
+    {"FPR13", "xmm12"},
+    {"FPR14", "xmm13"},
+    {"FPR15", "xmm14"},
+    {"FPR16", "xmm15"},
+    {"", ""},
+};
+/* Kernel argument placeholders: ARG1-ARG6 are registers, ARG7 and up are BPTR-relative stack slots in 8-byte steps starting at +16. */
+static RegisterMap Arguments[] = {
+    {"ARG1", "rdi"},
+    {"ARG2", "rsi"},
+    {"ARG3", "rdx"},
+    {"ARG4", "rcx"},
+    {"ARG5", "r8"},
+    {"ARG6", "r9"},
+    {"ARG7", "[BPTR+16]"},
+    {"ARG8", "[BPTR+24]"},
+    {"ARG9", "[BPTR+32]"},
+    {"ARG10", "[BPTR+40]"},
+    {"ARG11", "[BPTR+48]"},
+    {"ARG12", "[BPTR+56]"},
+    {"ARG13", "[BPTR+64]"},
+    {"ARG14", "[BPTR+72]"},
+    {"ARG15", "[BPTR+80]"},
+    {"ARG16", "[BPTR+88]"},
+    {"ARG17", "[BPTR+96]"},
+    {"ARG18", "[BPTR+104]"},
+    {"ARG19", "[BPTR+112]"},
+    {"ARG20", "[BPTR+120]"},
+    {"ARG21", "[BPTR+128]"},
+    {"ARG22", "[BPTR+136]"},
+    {"ARG23", "[BPTR+144]"},
+    {"ARG24", "[BPTR+152]"},
+    {"", ""},
+};
+/* Register names standing in for the SPTR (stack pointer) and BPTR (base pointer) placeholders. */
+static RegisterMap Sptr = {"SPTR", "rsp"};
+static RegisterMap Bptr = {"BPTR", "rbp"};
+
+#endif
diff --git a/bench/includes/isa_x86.h b/bench/includes/isa_x86.h
new file mode 100644
index 000000000..21350fcc3
--- /dev/null
+++ b/bench/includes/isa_x86.h
@@ -0,0 +1,204 @@
+/*
+ * =======================================================================================
+ *      Filename:  isa_x86.h
+ *
+ *      Description:  Definitions used to dynamically compile benchmarks for x86 systems
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_BENCH_ISA_X86_H
+#define LIKWID_BENCH_ISA_X86_H
+
+
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
+#define ARCHNAME "x86"
+#define WORDLENGTH 4
+
+
+/*
+ * Append the 32-bit x86 assembly preamble of a benchmark kernel to code:
+ * constant data tables, the global function label and the prologue.
+ * The prologue sets up ebp (the Arguments table addresses the stack through
+ * BPTR) and saves ebx, esi and edi, which footer() pops again.
+ * funcname may be NULL, in which case the symbol is "kernelfunction".
+ * Always returns 0.
+ */
+int header(struct bstrList* code, char* funcname)
+{
+    const char* symbol = funcname ? funcname : "kernelfunction";
+    bstring glline = bformat(".global %s", symbol);
+    bstring typeline = bformat(".type %s, @function", symbol);
+    bstring label = bformat("%s :", symbol);
+
+    bstrListAddChar(code, ".intel_syntax noprefix");
+    bstrListAddChar(code, ".data");
+    bstrListAddChar(code, ".align 64\nSCALAR:\n.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0");
+    bstrListAddChar(code, ".align 64\nSSCALAR:\n.single 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0");
+    bstrListAddChar(code, ".align 64\nISCALAR:\n.int 1, 1, 1, 1, 1, 1, 1, 1");
+    bstrListAddChar(code, ".align 16\nOMM:\n.int 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15");
+    bstrListAddChar(code, ".align 16\nIOMM:\n.int 0,16,32,48,64,80,96,128,144,160,176,192,208,224,240,256");
+    bstrListAddChar(code, ".align 16\nTOMM:\n.int 0,2,4,6,16,18,20,22,32,34,36,38,48,50,52,54");
+    bstrListAddChar(code, ".text");
+    bstrListAdd(code, glline);
+    bstrListAdd(code, typeline);
+    bstrListAdd(code, label);
+    /* Prologue; the push order must mirror the pops emitted by footer(). */
+    bstrListAddChar(code, "push ebp");
+    bstrListAddChar(code, "mov ebp, esp");
+    bstrListAddChar(code, "push ebx");
+    bstrListAddChar(code, "push esi");
+    bstrListAddChar(code, "push edi");
+
+    bstrListAddChar(code, "\n");
+    bdestroy(glline);
+    bdestroy(typeline);
+    bdestroy(label);
+    return 0;
+}
+
+/*
+ * Append the 32-bit x86 function epilogue to code: pop edi, esi and ebx,
+ * restore esp and ebp, return, then emit the .size directive and GNU-stack note.
+ * funcname may be NULL, in which case "kernelfunction" is used. Always returns 0.
+ */
+int footer(struct bstrList* code, char* funcname)
+{
+    const char* symbol = funcname ? funcname : "kernelfunction";
+    bstring line = bformat(".size %s, .-%s", symbol, symbol);
+
+    /* Restore edi, esi and ebx, then unwind the frame and return. */
+    bstrListAddChar(code, "pop edi");
+    bstrListAddChar(code, "pop esi");
+    bstrListAddChar(code, "pop ebx");
+    bstrListAddChar(code, "mov  esp, ebp");
+    bstrListAddChar(code, "pop ebp");
+    bstrListAddChar(code, "ret");
+    bstrListAdd(code, line);
+    bstrListAddChar(code, "\n");
+
+    /* Empty .note.GNU-stack section: the object does not need an executable stack. */
+    bstrListAddChar(code, "#if defined(__linux__) && defined(__ELF__)");
+    bstrListAddChar(code, ".section .note.GNU-stack,\"\",%progbits");
+    bstrListAddChar(code, "#endif");
+
+    bdestroy(line);
+    return 0;
+}
+
+/*
+ * Open the kernel loop: clear the loop counter placeholder GPR1 and place
+ * the loop label after a 16-byte alignment directive.
+ * loopname may be NULL, in which case the label is "kernelfunctionloop".
+ * step is not used here. Always returns 0.
+ */
+int loopheader(struct bstrList* code, char* loopname, int step)
+{
+    const char* name = loopname ? loopname : "kernelfunctionloop";
+    bstring label = bformat("%s:", name);
+
+    /* The counter starts at zero; loopfooter() increments and tests it. */
+    bstrListAddChar(code, "xor   GPR1, GPR1");
+    bstrListAddChar(code, ".align 16");
+    bstrListAdd(code, label);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(label);
+    return 0;
+}
+
+/*
+ * Close the kernel loop: add step to the counter GPR1, compare it with ARG1
+ * and jump back to the loop label while the counter is smaller.
+ * A caller-supplied loopname is referenced as "<loopname>b" (presumably a
+ * numeric gas local label such as "1" - confirm against the callers). Without
+ * a loopname the jump targets "kernelfunctionloop", the label loopheader() emits.
+ * Always returns 0.
+ */
+int loopfooter(struct bstrList* code, char* loopname, int step)
+{
+    bstring jump = loopname ? bformat("jl %sb", loopname)
+                            : bformat("jl kernelfunctionloop");
+    bstring bstep = bformat("add GPR1, %d", step);
+    bstrListAdd(code, bstep);
+    bstrListAddChar(code, "cmp   GPR1, ARG1");
+    bstrListAdd(code, jump);
+    bstrListAddChar(code, "\n");
+
+    bdestroy(bstep);
+    bdestroy(jump);
+    return 0;
+}
+
+/* Placeholder names paired with 32-bit x86 register names: GPR1-GPR6 are general purpose registers, FPR1-FPR8 are xmm0-xmm7; {"", ""} ends the table. */
+static RegisterMap Registers[] = {
+    {"GPR1", "eax"},
+    {"GPR2", "ebx"},
+    {"GPR3", "ecx"},
+    {"GPR4", "edx"},
+    {"GPR5", "esi"},
+    {"GPR6", "edi"},
+    {"FPR1", "xmm0"},
+    {"FPR2", "xmm1"},
+    {"FPR3", "xmm2"},
+    {"FPR4", "xmm3"},
+    {"FPR5", "xmm4"},
+    {"FPR6", "xmm5"},
+    {"FPR7", "xmm6"},
+    {"FPR8", "xmm7"},
+    {"", ""},
+};
+/* Kernel argument placeholders for 32-bit x86: every argument is a 4-byte stack slot above the saved ebp and return address, so ARG1 is [BPTR+8]. */
+static RegisterMap Arguments[] = {
+    {"ARG1", "[BPTR+8]"},
+    {"ARG2", "[BPTR+12]"},
+    {"ARG3", "[BPTR+16]"},
+    {"ARG4", "[BPTR+20]"},
+    {"ARG5", "[BPTR+24]"},
+    {"ARG6", "[BPTR+28]"},
+    {"ARG7", "[BPTR+32]"},
+    {"ARG8", "[BPTR+36]"},
+    {"ARG9", "[BPTR+40]"},
+    {"ARG10", "[BPTR+44]"},
+    {"ARG11", "[BPTR+48]"},
+    {"ARG12", "[BPTR+52]"},
+    {"ARG13", "[BPTR+56]"},
+    {"ARG14", "[BPTR+60]"},
+    {"ARG15", "[BPTR+64]"},
+    {"ARG16", "[BPTR+68]"},
+    {"ARG17", "[BPTR+72]"},
+    {"ARG18", "[BPTR+76]"},
+    {"ARG19", "[BPTR+80]"},
+    {"ARG20", "[BPTR+84]"},
+    {"ARG21", "[BPTR+88]"},
+    {"ARG22", "[BPTR+92]"},
+    {"ARG23", "[BPTR+96]"},
+    {"ARG24", "[BPTR+100]"},
+    {"", ""},
+};
+/* Register names standing in for the SPTR (stack pointer) and BPTR (base pointer) placeholders. */
+static RegisterMap Sptr = {"SPTR", "esp"};
+static RegisterMap Bptr = {"BPTR", "ebp"};
+
+#endif
diff --git a/bench/includes/ptt2asm.h b/bench/includes/ptt2asm.h
new file mode 100644
index 000000000..6876f31aa
--- /dev/null
+++ b/bench/includes/ptt2asm.h
@@ -0,0 +1,88 @@
+/*
+ * =======================================================================================
+ *      Filename:  ptt2asm.h
+ *
+ *      Description:  The interface to dynamically load ptt files
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:  Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_BENCH_PTT2ASM_H
+#define LIKWID_BENCH_PTT2ASM_H
+/* One table entry: a placeholder name (pattern) and the text it stands for (reg). */
+typedef struct {
+    char* pattern;
+    char* reg;
+} RegisterMap;
+/* Stream placeholders: STR0-STR4 stand for ARG2-ARG6, STR5 and up for literal rbp-relative stack slots. NOTE(review): the rbp operands look x86-64 specific - confirm for other ISAs. */
+static RegisterMap StreamPatterns[] = {
+    {"STR0", "ARG2"},
+    {"STR1", "ARG3"},
+    {"STR2", "ARG4"},
+    {"STR3", "ARG5"},
+    {"STR4", "ARG6"},
+    {"STR5", "[rbp+16]"},
+    {"STR6", "[rbp+24]"},
+    {"STR7", "[rbp+32]"},
+    {"STR8", "[rbp+40]"},
+    {"STR9", "[rbp+48]"},
+    {"STR10", "[rbp+56]"},
+    {"STR11", "[rbp+64]"},
+    {"STR12", "[rbp+72]"},
+    {"STR13", "[rbp+80]"},
+    {"STR14", "[rbp+88]"},
+    {"STR15", "[rbp+96]"},
+    {"STR16", "[rbp+104]"},
+    {"STR17", "[rbp+112]"},
+    {"STR18", "[rbp+120]"},
+    {"STR19", "[rbp+128]"},
+    {"STR20", "[rbp+136]"},
+    {"STR21", "[rbp+144]"},
+    {"STR22", "[rbp+152]"},
+    {"STR23", "[rbp+160]"},
+    {"STR24", "[rbp+168]"},
+    {"STR25", "[rbp+176]"},
+    {"STR26", "[rbp+184]"},
+    {"STR27", "[rbp+192]"},
+    {"STR28", "[rbp+200]"},
+    {"STR29", "[rbp+208]"},
+    {"STR30", "[rbp+216]"},
+    {"STR31", "[rbp+224]"},
+    {"STR32", "[rbp+232]"},
+    {"STR33", "[rbp+240]"},
+    {"STR34", "[rbp+248]"},
+    {"STR35", "[rbp+256]"},
+    {"STR36", "[rbp+264]"},
+    {"STR37", "[rbp+272]"},
+    {"STR38", "[rbp+280]"},
+    {"STR39", "[rbp+288]"},
+    {"STR40", "[rbp+296]"},
+    {"", ""},
+};
+/* Names of the user-provided (dynamic) benchmarks; may be NULL, otherwise the caller destroys the list. */
+struct bstrList* dynbench_getall(void);
+/* dynbench_test(): non-zero if testname can be loaded dynamically (presumably - confirm in ptt2asm.c).
+ * dynbench_load(): fills *testcase and returns 0 on success; dynbench_close() releases it again. */
+int dynbench_test(bstring testname);
+int dynbench_load(bstring testname, TestCase **testcase, char* tmpfolder, char *compilers, char* compileflags);
+int dynbench_close(TestCase* testcase, char* tmpfolder);
+#endif
diff --git a/bench/includes/strUtil.h b/bench/includes/strUtil.h
index 66722373d..2b197c11c 100644
--- a/bench/includes/strUtil.h
+++ b/bench/includes/strUtil.h
@@ -50,6 +50,7 @@ typedef struct {
     uint32_t numberOfThreads;
     int* processorIds;
     uint64_t size;
+    int init_per_thread;
     Stream* streams;
 } Workgroup;
 
diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h
index b4080d1ef..652a7b6b3 100644
--- a/bench/includes/test_types.h
+++ b/bench/includes/test_types.h
@@ -41,6 +41,7 @@ typedef enum {
     INT} DataType;
 
 typedef enum {
+    STREAM_0 = 0,
     STREAM_1 = 1,
     STREAM_2,
     STREAM_3,
@@ -96,6 +97,8 @@ typedef struct {
     int instr_const;
     int instr_loop;
     int uops;
+    int loadstores;
+    void* dlhandle;
 } TestCase;
 
 typedef struct {
@@ -105,6 +108,7 @@ typedef struct {
     const TestCase* test;
     uint64_t   cycles;
     uint32_t numberOfThreads;
+    int    init_per_thread;
     int* processors;
     void** streams;
 } ThreadUserData;
diff --git a/bench/includes/threads.h b/bench/includes/threads.h
index f0953b589..94d964fa1 100644
--- a/bench/includes/threads.h
+++ b/bench/includes/threads.h
@@ -32,6 +32,7 @@
 
 #include <pthread.h>
 #include <threads_types.h>
+#include <strUtil.h>
 
 #define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
 #define MIN_ITERATIONS 10
@@ -107,7 +108,8 @@ extern void threads_destroy(int numberOfGroups, int numberOfStreams);
 /**
  * @brief  Create Thread groups
  * @param  numberOfGroups The number of groups to create
+ * @param  groups Pointer to the groups data
  */
-extern void threads_createGroups(int numberOfGroups);
+extern void threads_createGroups(int numberOfGroups, Workgroup *groups);
 
 #endif /* THREADS_H */
diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c
index 0268ed367..cbdf087c3 100644
--- a/bench/likwid-bench.c
+++ b/bench/likwid-bench.c
@@ -39,6 +39,7 @@
 #include <ctype.h>
 #include <inttypes.h>
 #include <math.h>
+#include <signal.h>
 
 #include <bstrlib.h>
 #include <errno.h>
@@ -47,8 +48,10 @@
 #include <testcases.h>
 #include <strUtil.h>
 #include <allocator.h>
+#include <ptt2asm.h>
 
 #include <likwid.h>
+#include <likwid-marker.h>
 
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -72,7 +75,14 @@ extern void* getIterSingle(void* arg);
     printf("-l <TEST>\t list properties of benchmark \n"); \
     printf("-t <TEST>\t type of test \n"); \
     printf("-w\t\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>]-<streamId>:<domain_id>[:<offset>]\n"); \
-    printf("\t\t <size> in kB, MB or GB  (mandatory)\n"); \
+    printf("-W\t\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>]]\n"); \
+    printf("\t\t <size> in kB, MB or GB (mandatory)\n"); \
+    printf("For dynamically loaded benchmarks\n"); \
+    printf("-f <PATH>\t Specify a folder for the temporary files. default: /tmp\n"); \
+    printf("\n"); \
+    printf("Difference between -w and -W :\n"); \
+    printf("-w allocates the streams in the thread_domain with one thread and support placement of streams\n"); \
+    printf("-W allocates the streams chunk-wise by each thread in the thread_domain\n"); \
     printf("\n"); \
     printf("Usage: \n"); \
     printf("# Run the store benchmark on all CPUs of the system with a vector size of 1 GB\n"); \
@@ -81,6 +91,8 @@ extern void* getIterSingle(void* arg);
     printf("likwid-bench -t copy -w S0:100kB:1\n"); \
     printf("# Run the copy benchmark on one CPU at CPU socket 0 with a vector size of 100MB but place one stream on CPU socket 1\n"); \
     printf("likwid-bench -t copy -w S0:100MB:1-0:S0,1:S1\n"); \
+/*    printf("-c <COMP_LIST>\t Specify a list of compilers that should be searched for. default: gcc,icc,pgcc\n"); \*/
+/*    printf("-f <COMP_FLAGS>\t Specify compiler flags. Use \". default: \"-shared -fPIC\"\n"); \*/
 
 #define VERSION_MSG \
     printf("likwid-bench -- Version %d.%d.%d\n",VERSION,RELEASE,MINORVERSION); \
@@ -108,6 +120,13 @@ copyThreadData(ThreadUserData* src,ThreadUserData* dst)
 }
 
 
+/* SIGILL handler: print an explanatory error on stderr and exit with EXIT_FAILURE; the arguments are unused. */
+void illhandler(int signum, siginfo_t *info, void *ptr)
+{
+    fputs("ERROR: Illegal instruction\n", stderr);
+    fputs("This happens if you want to run a kernel that uses instructions not available on your system.\n", stderr);
+    exit(EXIT_FAILURE);
+}
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
@@ -126,7 +145,7 @@ int main(int argc, char** argv)
     double time;
     double cycPerUp = 0.0;
     double cycPerCL = 0.0;
-    const TestCase* test = NULL;
+    TestCase* test = NULL;
     uint64_t realSize = 0;
     uint64_t realIter = 0;
     uint64_t maxCycles = 0;
@@ -141,8 +160,17 @@ int main(int argc, char** argv)
     binsertch(HLINE, 0, 80, '-');
     binsertch(HLINE, 80, 1, '\n');
     int (*ownprintf)(const char *format, ...);
+#ifdef _ARCH_PPC
+    int clsize = 128;
+#else
     int clsize = sysconf (_SC_LEVEL1_DCACHE_LINESIZE);
+#endif
+    char compilers[512] = "gcc,icc,pgcc";
+    char defcompilepath[512] = "/tmp";
+    char compilepath[513] = "";
+    char compileflags[512] = "-shared -fPIC";
     ownprintf = &printf;
+    struct sigaction sig;
 
     /* Handling of command line options */
     if (argc ==  1)
@@ -151,7 +179,23 @@ int main(int argc, char** argv)
         exit(EXIT_SUCCESS);
     }
 
-    while ((c = getopt (argc, argv, "w:t:s:l:aphvi:")) != -1) {
+    while ((c = getopt (argc, argv, "W:w:t:s:l:aphvi:f:")) != -1) {
+        switch (c)
+        {
+            case 'f':
+                /* snprintf() always NUL-terminates within the given size. Its
+                 * return value is the untruncated length of optarg and may be
+                 * larger than the buffer, so it must never be used as an index
+                 * to store a terminator (that would write past compilepath). */
+                tmp = snprintf(compilepath, 512, "%s", optarg);
+                break;
+            default:
+                break;
+        }
+    }
+    optind = 0;
+
+    while ((c = getopt (argc, argv, "W:w:t:s:l:aphvi:f:")) != -1) {
         switch (c)
         {
             case 'h':
@@ -162,8 +206,29 @@ int main(int argc, char** argv)
                 exit (EXIT_SUCCESS);
             case 'a':
                 ownprintf(TESTS"\n");
+
+                struct bstrList* l = dynbench_getall();
+                if (l)
+                {
+                    ownprintf("\nUser benchmarks:\n");
+                    for (i = 0; i < l->qty; i++)
+                    {
+                        if (dynbench_test(l->entry[i]))
+                        {
+                            TestCase* t = NULL;
+                            int err = dynbench_load(l->entry[i], &t, NULL, NULL, NULL);
+                            if (!err && t)
+                            {
+                                printf("%s - %s\n", t->name, t->desc);
+                                dynbench_close(t, NULL);
+                            }
+                        }
+                    }
+                    bstrListDestroy(l);
+                }
                 exit (EXIT_SUCCESS);
             case 'w':
+            case 'W':
                 numberOfWorkgroups++;
                 break;
             case 's':
@@ -180,15 +245,22 @@ int main(int argc, char** argv)
             case 'l':
                 bdestroy(testcase);
                 testcase = bfromcstr(optarg);
+                int builtin = 1;
                 for (i=0; i<NUMKERNELS; i++)
                 {
                     if (biseqcstr(testcase, kernels[i].name))
                     {
-                        test = kernels+i;
+                        test = (TestCase*)kernels+i;
                         break;
                     }
                 }
 
+                if (test == NULL && dynbench_test(testcase))
+                {
+                    dynbench_load(testcase, &test, NULL, NULL, NULL);
+                    builtin = 0;
+                }
+
                 if (test == NULL)
                 {
                     fprintf (stderr, "Error: Unknown test case %s\n",optarg);
@@ -255,6 +327,10 @@ int main(int argc, char** argv)
                     }
                 }
                 bdestroy(testcase);
+                if (!builtin)
+                {
+                    dynbench_close(test, NULL);
+                }
                 exit (EXIT_SUCCESS);
 
                 break;
@@ -275,11 +351,21 @@ int main(int argc, char** argv)
                 {
                     if (biseqcstr(testcase, kernels[i].name))
                     {
-                        test = kernels+i;
+                        test = (TestCase*)kernels+i;
                         break;
                     }
                 }
 
+                if (test == NULL && dynbench_test(testcase))
+                {
+                    if (strlen(compilepath) == 0)
+                    {
+                        int ret = snprintf(compilepath, 512, "%s", defcompilepath);
+                        if (ret > 0) compilepath[ret] = '\0';
+                    }
+                    dynbench_load(testcase, &test, compilepath, compilers, compileflags);
+                }
+
                 if (test == NULL)
                 {
                     fprintf (stderr, "Error: Unknown test case %s\n",optarg);
@@ -287,6 +373,8 @@ int main(int argc, char** argv)
                 }
                 bdestroy(testcase);
                 break;
+            case 'f':
+                break;
             case '?':
                 if (isprint (optopt))
                     fprintf (stderr, "Unknown option `-%c'.\n", optopt);
@@ -322,6 +410,11 @@ int main(int argc, char** argv)
     affinity_init();
     timer_init();
 
+    memset(&sig, 0, sizeof(struct sigaction));
+    sig.sa_sigaction = illhandler;
+    sig.sa_flags = SA_SIGINFO;
+    sigaction(SIGILL, &sig, NULL);
+
     if (optPrintDomains)
     {
         bdestroy(testcase);
@@ -343,21 +436,28 @@ int main(int argc, char** argv)
 
     allocator_init(numberOfWorkgroups * MAX_STREAMS);
     groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
+    memset(groups, 0, numberOfWorkgroups*sizeof(Workgroup));
     tmp = 0;
 
     optind = 0;
-    while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1)
+    while ((c = getopt (argc, argv, "W:w:t:s:l:i:aphvf:")) != -1) /* 'f:' is listed so that -f <PATH> is consumed here too instead of being reported as an invalid option */
     {
         switch (c)
         {
             case 'w':
+            case 'W':
                 currentWorkgroup = groups+tmp;
                 bstring groupstr = bfromcstr(optarg);
+                if (c == 'W')
+                {
+                    currentWorkgroup->init_per_thread = 1;
+                }
                 i = bstr_to_workgroup(currentWorkgroup, groupstr, test->type, test->streams);
                 bdestroy(groupstr);
                 size_t newsize = 0;
                 size_t stride = test->stride;
                 int nrThreads = currentWorkgroup->numberOfThreads;
+                int clsize = 128;
                 size_t orig_size = currentWorkgroup->size;
                 if (i == 0)
                 {
@@ -404,7 +504,8 @@ int main(int argc, char** argv)
                                                     currentWorkgroup->streams[i].offset,
                                                     test->type,
                                                     test->stride,
-                                                    currentWorkgroup->streams[i].domain);
+                                                    currentWorkgroup->streams[i].domain,
+                                                    currentWorkgroup->init_per_thread && nrThreads > 1);
                     }
                     tmp++;
                 }
@@ -413,16 +514,48 @@ int main(int argc, char** argv)
                     exit(EXIT_FAILURE);
                 }
                 if (newsize != currentWorkgroup->size)
+                {
                     currentWorkgroup->size = newsize;
+                }
+                if (nrThreads > 1)
+                {
+                    if (currentWorkgroup->init_per_thread)
+                    {
+                        printf("Initialization: Each thread in domain initializes its own stream chunks\n");
+                    }
+                    else
+                    {
+                        printf("Initialization: First thread in domain initializes the whole stream\n");
+                    }
+                }
                 break;
             default:
                 continue;
                 break;
         }
     }
+    /* Workgroups that differ in thread count or size are accepted, but each
+     * kind of mismatch is reported once as a warning. */
+    if (numberOfWorkgroups > 1)
+    {
+        /* Same types as the Workgroup fields, so large sizes are not truncated. */
+        uint32_t g0_numberOfThreads = groups[0].numberOfThreads;
+        uint64_t g0_size = groups[0].size;
+        for (i = 1; i < numberOfWorkgroups; i++)
+        {
+            if (g0_numberOfThreads == groups[i].numberOfThreads) continue;
+            fprintf (stderr, "Warning: Multiple workgroups with different thread counts are not recommended. Use with care!\n");
+            break;
+        }
+
+        for (i = 1; i < numberOfWorkgroups; i++)
+        {
+            if (g0_size == groups[i].size) continue;
+            fprintf (stderr, "Warning: Multiple workgroups with different sizes are not recommended. Use with care!\n");
+            break;
+        }
+    }
 
-    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
-     * module only allows equally sized thread groups*/
     for (i=0; i<numberOfWorkgroups; i++)
     {
         globalNumberOfThreads += groups[i].numberOfThreads;
@@ -438,7 +571,7 @@ int main(int argc, char** argv)
 
 
     threads_init(globalNumberOfThreads);
-    threads_createGroups(numberOfWorkgroups);
+    threads_createGroups(numberOfWorkgroups, groups);
 
     /* we configure global barriers only */
     barrier_init(1);
@@ -468,6 +601,7 @@ int main(int argc, char** argv)
         myData.test = test;
         myData.cycles = 0;
         myData.numberOfThreads = groups[i].numberOfThreads;
+        myData.init_per_thread = groups[i].init_per_thread;
         myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
         myData.streams = (void**) malloc(test->streams * sizeof(void*));
 
@@ -555,8 +689,8 @@ int main(int argc, char** argv)
     ownprintf("Iterations:\t\t%" PRIu64 "\n", realIter);
     ownprintf("Iterations per thread:\t%" PRIu64 "\n",iters_per_thread);
     ownprintf("Inner loop executions:\t%d\n", (int)(((double)realSize)/((double)test->stride*globalNumberOfThreads)));
-    ownprintf("Size (Byte):\t\t%" PRIu64 "\n",  realSize * test->bytes );
-    ownprintf("Size per thread:\t%" PRIu64 "\n", size_per_thread * test->bytes);
+    ownprintf("Size (Byte):\t\t%" PRIu64 "\n",  realSize * datatypesize * test->streams);
+    ownprintf("Size per thread:\t%" PRIu64 "\n", size_per_thread * datatypesize * test->streams);
     ownprintf("Number of Flops:\t%" PRIu64 "\n", (iters_per_thread * realSize *  test->flops));
     ownprintf("MFlops/s:\t\t%.2f\n",
             1.0E-06 * ((double) (iters_per_thread * realSize *  test->flops) /  time));
@@ -637,7 +771,11 @@ int main(int argc, char** argv)
     LIKWID_MARKER_CLOSE;
 #endif
 
+    if (test->dlhandle != NULL)
+    {
+        dynbench_close(test, compilepath);
+    }
+
     bdestroy(HLINE);
     return EXIT_SUCCESS;
 }
-
diff --git a/bench/perl/AsmGen.pl b/bench/perl/AsmGen.pl
index fb47a4239..f2387da9e 100755
--- a/bench/perl/AsmGen.pl
+++ b/bench/perl/AsmGen.pl
@@ -198,7 +198,7 @@ END
 | LOOP SYMBOL INUMBER SREG block
 {[
 {FUNC => 'as::loop_entry',
- ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]},
  $item{block},
 {FUNC => 'as::loop_exit',
  ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
diff --git a/bench/perl/gas.pm b/bench/perl/gas.pm
index 1a74c7c90..125695b6c 100644
--- a/bench/perl/gas.pm
+++ b/bench/perl/gas.pm
@@ -1,11 +1,51 @@
-#!/usr/bin/env perl 
+#!/usr/bin/perl
+# =======================================================================================
+#
+#      Filename:  gas.pm
+#
+#      Description:  Implements gas callbacks for likwid asm parser.
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:  Jan Treibig (jt), jan.treibig@gmail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
 
 package as;
 use Data::Dumper;
 use isax86;
 use isax86_64;
-use isaarmv7;
-use isaarmv8;
+use isappc64;
+
+sub init    # select the assembler HEADER/FOOTER strings for $main::ISA
+{
+    if ($main::ISA eq 'x86') {
+        $AS = { HEADER     => '.intel_syntax noprefix',
+                FOOTER     => ''};
+    } elsif ($main::ISA eq 'x86-64') {    # hyphenated, matching every other 'x86-64' test in this file
+        $AS = { HEADER     => '.intel_syntax noprefix',
+                FOOTER     => ''};
+    } elsif ($main::ISA eq 'ppc64') {
+        $AS = { HEADER     => '',
+                FOOTER     => ''};
+    }
+}
 
 $LOCAL = {};
 $MODE = 'GLOBAL';
@@ -16,267 +56,261 @@ my $STACKPTR;
 my $BASEPTR;
 my $REG;
 my $ARG;
+my $ALIGN='64';
 
 sub emit_code
 {
-	my $code = shift;
-	$code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;
-	$code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;
-	$code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
-	print "$code\n";
+    my $code = shift;
+    $code =~ s/([GF]PR[0-9]+)/$REG->{$1}/g;
+    $code =~ s/(ARG[0-9]+)/$ARG->{$1}/g;
+    $code =~ s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
+    print "$code\n";
 }
 
 sub align
 {
-	my $number = shift;
-	print ".align $number\n";
+    my $number = shift;
+    print ".align $number\n";
 
 }
 
 sub mode
 {
-	$cmd = shift;
+    $cmd = shift;
 
-	if ($cmd eq 'START') {
-		$MODE = 'LOCAL';
-	} elsif ($cmd eq 'STOP') {
-		$MODE = 'GLOBAL';
-	}
+    if ($cmd eq 'START') {
+        $MODE = 'LOCAL';
+    } elsif ($cmd eq 'STOP') {
+        $MODE = 'GLOBAL';
+    }
 }
 
 sub function_entry
 {
-	my $symbolname = shift;
-	my $allocate = shift;
-	my $distance;
+    my $symbolname = shift;
+    my $allocate = shift;
+    my $distance;
 
-	foreach ( (0 .. $allocate) ) {
-		$distance =  $_ * $WORDLENGTH;
-		$LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
-	}
+    foreach ( (0 .. $allocate) ) {
+        $distance =  $_ * $WORDLENGTH;
+        $LOCAL->{"LOCAL$_"} = "[$BASEPTR-$distance]";
+    }
 
-	if($CURRENT_SECTION ne 'text') {
-		$CURRENT_SECTION = 'text';
-		print ".text\n";
-	}
+    if($CURRENT_SECTION ne 'text') {
+        $CURRENT_SECTION = 'text';
+        print ".text\n";
+    }
 
-	print ".global $symbolname\n";
+    if ($main::ISA eq 'x86') {
+        print ".globl $symbolname\n";
+        print ".type $symbolname, \@function\n";
+        print "$symbolname :\n";
+        print "push ebp\n";
+        print "mov ebp, esp\n";
+        $distance = $allocate * $WORDLENGTH;
+        print "sub  esp, $distance\n" if ($allocate);
+        print "push ebx\n";
+        print "push esi\n";
+        print "push edi\n";
+    } elsif ($main::ISA eq 'x86-64') {
+        print ".globl $symbolname\n";
+        print ".type $symbolname, \@function\n";
+        print "$symbolname :\n";
+        print "push rbp\n";
+        print "mov rbp, rsp\n";
+        $distance = $allocate * $WORDLENGTH;
+        print "sub  rsp, $distance\n" if ($allocate);
+        print "push rbx\n";
+        print "push r12\n";
+        print "push r13\n";
+        print "push r14\n";
+        print "push r15\n";
+    } elsif ($main::ISA eq 'ppc64') {
+        #if ($main::ISA eq 'ppc64') {
+            print ".set r0,0; .set SP,1; .set RTOC,2; .set r3,3; .set r4,4;\n";
+            print ".set r5,5; .set r6,6; .set r7,7; .set r8,8; .set r9,9; .set r10,10\n";
+            print ".set x0,0; .set x1,1; .set x2,2; .set x3,3; .set x4,4\n";
+            print ".set x5,5; .set x6,6; .set x7,7; .set x8,8; .set x9,9;\n";
+	    print ".set vec0,0; .set vec1,1; .set vec2,2; .set vec3,3;\n";
+	    print ".set vec4,4; .set vec5,5; .set vec6,6; .set vec7,7;\n";
+	    print ".set vec8,8; .set vec9,9; .set vec10,10; .set vec11,11;\n";
+	    print ".set vec12,12;\n";
+            #}
+        print ".abiversion 2\n";
+        print ".section    \".toc\",\"aw\"\n";
+        print ".section    \".text\"\n";
+        print ".align 2\n";
+        print ".globl $symbolname\n";
+        print ".type $symbolname, \@function\n";
+        print "$symbolname :\n";
+        print ".L.$symbolname:\n";
+        print ".localentry $symbolname, .-$symbolname\n";
 
-	if ($main::ISA eq 'ARMv7' or $main::ISA eq 'ARMv8') {
-		print ".type $symbolname, %function\n";
-	} else {
-		print ".type $symbolname, \@function\n";
-	}
-	print "$symbolname :\n";
-
-	if ($main::ISA eq 'x86') {
-		print "push ebp\n";
-		print "mov ebp, esp\n";
-		$distance = $allocate * $WORDLENGTH;
-		print "sub  esp, $distance\n" if ($allocate);
-		print "push ebx\n";
-		print "push esi\n";
-		print "push edi\n";
-	} elsif ($main::ISA eq 'x86-64') {
-		print "push rbp\n";
-		print "mov rbp, rsp\n";
-		$distance = $allocate * $WORDLENGTH;
-		print "sub  rsp, $distance\n" if ($allocate);
-		print "push rbx\n";
-		print "push r12\n";
-		print "push r13\n";
-		print "push r14\n";
-		print "push r15\n";
-	} elsif ($main::ISA eq 'ARMv7') {
-		print "push     {r4-r7, lr}\n";
-		print "add      r7, sp, #12\n";
-		print "push     {r8, r10, r11}\n";
-		print "vstmdb   sp!, {d8-d15}\n";
-	}
+    }
 }
 
 sub function_exit
 {
-	my $symbolname = shift;
+    my $symbolname = shift;
 
-	$LOCAL = {};
+    $LOCAL = {};
 
-	if ($main::ISA eq 'x86') {
-		print "pop edi\n";
-		print "pop esi\n";
-		print "pop ebx\n";
-		print "mov  esp, ebp\n";
-		print "pop ebp\n";
-		print "ret\n";
-	} elsif ($main::ISA eq 'x86-64') {
-		print "pop r15\n";
-		print "pop r14\n";
-		print "pop r13\n";
-		print "pop r12\n";
-		print "pop rbx\n";
-		print "mov  rsp, rbp\n";
-		print "pop rbp\n";
-		print "ret\n";
-	} elsif ($main::ISA eq 'ARMv7') {
-		print "vldmia   sp!, {d8-d15}\n";
-		print "pop      {r8, r10, r11}\n";
-		print "pop      {r4-r7, pc}\n";
-	} elsif ($main::ISA eq 'ARMv8') {
-		print ".exit:\n";
-		print "\tret\n";
-	}
-	print ".size $symbolname, .-$symbolname\n";
-	print "\n";
+    if ($main::ISA eq 'x86') {
+        print "pop edi\n";
+        print "pop esi\n";
+        print "pop ebx\n";
+        print "mov  esp, ebp\n";
+        print "pop ebp\n";
+        print "ret\n";
+        print ".size $symbolname, .-$symbolname\n";
+    } elsif ($main::ISA eq 'x86-64') {
+        print "pop r15\n";
+        print "pop r14\n";
+        print "pop r13\n";
+        print "pop r12\n";
+        print "pop rbx\n";
+        print "mov  rsp, rbp\n";
+        print "pop rbp\n";
+        print "ret\n";
+        print ".size $symbolname, .-$symbolname\n";
+    } elsif ($main::ISA eq 'ppc64') {
+        print "blr\n";
+        print ".size $symbolname, .-$symbolname\n";
+    }
+    #print ".size $symbolname, .-$symbolname\n";
+    print "\n";
 }
 
 sub define_data
 {
-	my $symbolname = shift;
-	my $type = shift;
-	my $value = shift;
+    my $symbolname = shift;
+    my $type = shift;
+    my $value = shift;
 
-	if($CURRENT_SECTION ne 'data') {
-		$CURRENT_SECTION = 'data';
-		print ".data\n";
-	}
-	if  ($main::ISA ne 'ARMv7' and $main::ISA ne 'ARMv8') {
-		print ".align 64\n";
-		print "$symbolname:\n";
-		if ($type eq 'DOUBLE') {
-			print ".double $value, $value, $value, $value, $value, $value, $value, $value\n"
-		} elsif ($type eq 'SINGLE') {
-			print ".single $value, $value, $value, $value, $value, $value, $value, $value\n"
-		} elsif ($type eq 'INT') {
-			print ".int $value, $value\n"
-		}
-	}
+    if($CURRENT_SECTION ne 'data') {
+        $CURRENT_SECTION = 'data';
+        print ".data\n";
+    }
+    print ".align $ALIGN\n";
+    print "$symbolname:\n";
+    if ($type eq 'DOUBLE') {
+        print ".double $value, $value, $value, $value, $value, $value, $value, $value\n"
+    } elsif ($type eq 'SINGLE') {
+        print ".single $value, $value, $value, $value, $value, $value, $value, $value\n"
+    } elsif ($type eq 'INT') {
+        print ".int $value, $value\n"
+    }
 }
 
 sub define_offset
 {
-	my $symbolname = shift;
-	my $type = shift;
-	my $value = shift;
+    my $symbolname = shift;
+    my $type = shift;
+    my $value = shift;
 
-	if($CURRENT_SECTION ne 'data') {
-		$CURRENT_SECTION = 'data';
-		print ".data\n";
-	}
-	if ($main::ISA eq 'ARMv7' or $main::ISA eq 'ARMv8') {
-		print ".align 2\n";
-	} else {
-		print ".align 16\n";
-	}
-	print "$symbolname:\n";
-  print ".int $value\n";
+    if($CURRENT_SECTION ne 'data') {
+        $CURRENT_SECTION = 'data';
+        print ".data\n";
+    }
+    print ".align $ALIGN\n";
+    print "$symbolname:\n";
+    print ".int $value\n";
 }
 
 
 sub loop_entry
 {
-  my $symbolname = shift;
-  my $stopping_criterion = shift;
-  $stopping_criterion = $REG->{$stopping_criterion} if( exists $REG->{$stopping_criterion});
+    my $symbolname = shift;
+    #my $stopping_criterion = shift;
+    my $step = shift;
 
-  if ($main::ISA eq 'x86') {
-    print "xor   eax, eax\n";
-  } elsif ($main::ISA eq 'x86-64') {
-    print "xor   rax, rax\n";
-  } elsif ($main::ISA eq 'ARMv7') {
-    print "mov   r4, #0\n";
-  } elsif ($main::ISA eq 'ARMv8') {
-    print "\tmov   x6, 0\n";
-    print ".loop:\n";
-    #print "\tcmp w0, w6\n";
-    #print "\tblt .exit\n";
-  }
-  if ($main::ISA eq 'ARMv7') {
-	  print ".align 2\n";
-  } elsif ($main::ISA eq 'ARMv8') {
-	  print "\n";
-  } else {
-	  print ".align 16\n";
-  }
-  if ($MODE eq 'GLOBAL') {
-    print "$symbolname :\n";
-  } elsif ($main::ISA ne 'ARMv8') {
-    print "1:\n";
-  }
+    if ($main::ISA eq 'x86') {
+        print "xor   eax, eax\n";
+        print ".align $ALIGN\n";
+        if ($MODE eq 'GLOBAL') {
+            print "$symbolname :\n";
+        } else {
+            print "1:\n";
+        }
+    } elsif ($main::ISA eq 'x86-64') {
+        print "xor   rax, rax\n";
+        print ".align $ALIGN\n";
+        if ($MODE eq 'GLOBAL') {
+            print "$symbolname :\n";
+        } else {
+            print "1:\n";
+        }
+    } elsif ($main::ISA eq 'ppc64') {
+        print "mr r0, r10\n";           # save r10 in r0; 'li r0, r10' would load the constant 10 (.set r10,10)
+        print "li r10, $step\n";        # step value passed in by the LOOP rule
+        print "divd r10, r3, r10\n";    # r10 = r3 / step
+        print "mtctr r10\n";            # CTR is decremented by the bdnz printed in loop_exit
+        print "mr r10, r0\n";           # restore r10; 'li r10, r0' would load the constant 0
+        print "$symbolname:\n";
+    }
 
 }
 
 
 sub loop_exit
 {
-  my $symbolname = shift;
-  my $step = shift;
+    my $symbolname = shift;
+    my $step = shift;
 
-  if ($main::ISA eq 'x86') {
-    print "add eax, $step\n";
-    print "cmp eax, edi\n";
-  } elsif ($main::ISA eq 'x86-64') {
-    print "add rax, $step\n";
-    print "cmp rax, rdi\n";
-  } elsif ($main::ISA eq 'ARMv7') {
-    print "add r4, #$step\n";
-    print "cmp r4, r0\n";
-  } elsif ($main::ISA eq 'ARMv8') {
-    print "\tadd x6, x6, #$step\n";
-    print "\tcmp x6, x0\n";
-    print "\tblt .loop\n";
-  }
-  if ($MODE eq 'GLOBAL') {
-    print "jl $symbolname\n";
-  }else {
-	  if ($main::ISA eq 'ARMv7') {
-		  print "blt 1b\n";
-	  } elsif ($main::ISA eq 'ARMv8') {
-	  	#print "bgt 1b\n";
-	  	print "\n";
-	  } else {
-		  print "jl 1b\n";
-	  }
-  }
-  print "\n";
+    if ($main::ISA eq 'x86') {
+        print "add eax, $step\n";
+        print "cmp eax, edi\n";
+        if ($MODE eq 'GLOBAL') {
+            print "jl $symbolname\n";
+        } else {
+            print "jl 1b\n";
+        }
+        print "\n";
+    } elsif ($main::ISA eq 'x86-64') {
+        print "add rax, $step\n";
+        print "cmp rax, rdi\n";
+        if ($MODE eq 'GLOBAL') {
+            print "jl $symbolname\n";
+        } else {
+            print "jl 1b\n";
+        }
+        print "\n";
+    } elsif ($main::ISA eq 'ppc64') {
+        print "bdnz $symbolname\n";
+    }
 }
 
 sub isa_init
 {
-  if ($main::ISA eq 'x86') {
-    $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
-    $STACKPTR = $isax86::STACKPTR_X86 ;
-    $BASEPTR = $isax86::BASEPTR_X86 ;
-    $REG = $isax86::REG_X86;
-    $ARG = $isax86::ARG_X86 ;
-    $AS = { HEADER     => '.intel_syntax noprefix',
-	    FOOTER     => '' };
-  } elsif ($main::ISA eq 'x86-64') {
-    $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
-    $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
-    $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
-    $REG = $isax86_64::REG_X86_64;
-    $ARG = $isax86_64::ARG_X86_64 ;
-    $AS = { HEADER     => '.intel_syntax noprefix',
-	    FOOTER     => '' };
-  } elsif ($main::ISA eq 'ARMv7') {
-    $BASEPTR = $isaarmv7::BASEPTR_ARMv7;
-    $WORDLENGTH = $isaarmv7::WORDLENGTH_ARMv7;
-    $STACKPTR = $isaarmv7::STACKPTR_ARMv7 ;
-    $REG = $isaarmv7::REG_ARMv7;
-    $ARG = $isaarmv7::ARG_ARMv7 ;
-    $AS = { HEADER     => ".cpu    cortex-a15\n.fpu    neon-vfpv4",
-            FOOTER     => '' };
-  } elsif ($main::ISA eq 'ARMv8') {
-    $BASEPTR = $isaarmv8::BASEPTR_ARMv8;
-    $WORDLENGTH = $isaarmv8::WORDLENGTH_ARMv8;
-    $STACKPTR = $isaarmv8::STACKPTR_ARMv8 ;
-    $REG = $isaarmv8::REG_ARMv8;
-    $ARG = $isaarmv8::ARG_ARMv8 ;
-    $AS = { HEADER     => ".cpu    generic+fp+simd",
-            FOOTER     => '' };
-    
-  }
-
+    if ($main::ISA eq 'x86') {
+        $WORDLENGTH = $isax86::WORDLENGTH_X86 ;
+        $STACKPTR = $isax86::STACKPTR_X86 ;
+        $BASEPTR = $isax86::BASEPTR_X86 ;
+        $REG = $isax86::REG_X86;
+        $ARG = $isax86::ARG_X86 ;
+        $AS = { HEADER     => '.intel_syntax noprefix',
+                FOOTER     => ''};
+        $ALIGN = '64';
+    } elsif ($main::ISA eq 'x86-64') {
+        $WORDLENGTH = $isax86_64::WORDLENGTH_X86_64;
+        $STACKPTR = $isax86_64::STACKPTR_X86_64 ;
+        $BASEPTR = $isax86_64::BASEPTR_X86_64 ;
+        $REG = $isax86_64::REG_X86_64;
+        $ARG = $isax86_64::ARG_X86_64 ;
+        $AS = { HEADER     => '.intel_syntax noprefix',
+                FOOTER     => ''};
+        $ALIGN = '64';
+    } elsif ($main::ISA eq 'ppc64') {
+        $WORDLENGTH = $isappc64::WORDLENGTH_PPC64;
+        $STACKPTR = $isappc64::STACKPTR_PPC64 ;
+        $BASEPTR = $isappc64::BASEPTR_PPC64 ;
+        $REG = $isappc64::REG_PPC64;
+        $ARG = $isappc64::ARG_PPC64 ;
+        $AS = { HEADER     => '',
+                FOOTER     => ''};
+        $ALIGN = '16';
+    }
 }
 
+
 1;
diff --git a/bench/perl/generatePas.pl b/bench/perl/generatePas.pl
index 81dde59b6..9ce2240a1 100755
--- a/bench/perl/generatePas.pl
+++ b/bench/perl/generatePas.pl
@@ -114,6 +114,7 @@
 
         $file =~ /([A-Za-z_0-9]+)\.ptt/;
         $name = $1;
+        if ($name =~ /^$/) { next; }
 
         $isLoop = 0;
         $skip=0;
@@ -121,6 +122,7 @@
         $prolog='';
         $loop='';
         $desc='';
+        $streams=1;
         my $loads=-1;
         my $stores=-1;
         my $branches=-1;
diff --git a/bench/perl/isappc64.pm b/bench/perl/isappc64.pm
new file mode 100644
index 000000000..8bdca1234
--- /dev/null
+++ b/bench/perl/isappc64.pm
@@ -0,0 +1,89 @@
+#!/usr/bin/perl 
+
+package isappc64;
+
+$WORDLENGTH_PPC64 = 8;
+$STACKPTR_PPC64 = '1';
+$BASEPTR_PPC64  = '2';
+
+$REG_PPC64 = { GPR1 => '3',
+	     GPR2 => '4',
+		 GPR3 => '5',
+		 GPR4 => '6',
+		 GPR5 => '7',
+		 GPR6 => '8',
+		 GPR7 => '9',
+		 GPR8 => '10',
+		 GPR9 => '11',
+		 GPR10 => '12',
+		 GPR11 => '13',
+		 GPR12 => '14',
+		 GPR13 => '15',
+		 GPR14 => '16',
+		 GPR15 => '17',
+		 GPR16 => '18',
+		 GPR17 => '19',
+		 GPR18 => '20',
+		 GPR19 => '21',
+		 GPR20 => '22',
+		 GPR21 => '23',
+		 GPR22 => '24',
+		 GPR23 => '25',
+		 GPR24 => '26',
+		 GPR25 => '27',
+		 GPR26 => '28',
+		 GPR27 => '29',
+		 GPR28 => '30',
+		 GPR29 => '31',
+         FPR1 => '0',
+	     FPR2 => '1',
+		 FPR3 => '2',
+		 FPR4 => '3',
+		 FPR5 => '4',
+		 FPR6 => '5',
+		 FPR7 => '6',
+		 FPR8 => '7',
+		 FPR9 => '8',
+		 FPR10 => '9',
+		 FPR11 => '10',
+		 FPR12 => '11',
+		 FPR13 => '12',
+		 FPR14 => '13',
+		 FPR15 => '14',
+		 FPR16 => '15',
+		 FPR17 => '16',
+		 FPR18 => '17',
+		 FPR19 => '18',
+		 FPR20 => '19',
+		 FPR21 => '20',
+		 FPR22 => '21',
+		 FPR23 => '22',
+		 FPR24 => '23',
+		 FPR25 => '24',
+		 FPR26 => '25',
+		 FPR27 => '26',
+		 FPR28 => '27',
+		 FPR29 => '28',
+		 FPR30 => '29',
+		 FPR31 => '30',
+		 FPR32 => '31'};
+
+$ARG_PPC64 = { ARG1 => '3',
+	     ARG2 => '4',
+	     ARG3 => '5',
+	     ARG4 => '6',
+	     ARG5 => '7',
+	     ARG6 => '8',
+	     ARG7 => '9',
+	     ARG8 => '10',
+	     ARG9 => '[rbp+56]'};
+
+sub emit_code    # print $code with GPRn/FPRn placeholders replaced via $REG_PPC64
+{
+	my $code = shift;
+	$code =~ s/([GF]PR[0-9]+)/$REG_PPC64->{$1}/g;
+	print "$code\n";
+}
+
+
+1;
diff --git a/bench/power/add_scalar4.ptt b/bench/power/add_scalar4.ptt
new file mode 100644
index 000000000..082d0ac45
--- /dev/null
+++ b/bench/power/add_scalar4.ptt
@@ -0,0 +1,36 @@
+STREAMS 3
+TYPE DOUBLE
+DESC Double-precision add, only scalar operations
+FLOPS 1
+BYTES 24
+LOADS 2
+STORES 1
+INSTR_LOOP 19
+INSTR_CONST 5
+UOPS 19
+
+LOOP 4
+
+lfd     1, 0(STR0)
+lfd     2, 0(STR1)
+fadd    0, 1, 2
+stfd    0, 0(STR2)
+
+lfd     4, 8(STR0)
+lfd     5, 8(STR1)
+fadd    3, 4, 5
+stfd    3, 8(STR2)
+
+lfd     7, 16(STR0)
+lfd     8, 16(STR1)
+fadd    6, 7, 8
+stfd    6, 16(STR2)
+
+lfd     10, 24(STR0)
+lfd     11, 24(STR1)
+fadd    9, 10, 11
+stfd    9, 24(STR2)
+
+addi STR0, STR0, 32
+addi STR1, STR1, 32
+addi STR2, STR2, 32
diff --git a/bench/power/add_vsx1.ptt b/bench/power/add_vsx1.ptt
new file mode 100644
index 000000000..0c5f35a36
--- /dev/null
+++ b/bench/power/add_vsx1.ptt
@@ -0,0 +1,13 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 1
+BYTES 24
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+
+LOOP 2
+lxvd2x x1, STR0, r7
+lxvd2x x2, STR1, r7
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r7
+addi r7, r7, 16
diff --git a/bench/power/add_vsx2.ptt b/bench/power/add_vsx2.ptt
new file mode 100644
index 000000000..9f3018814
--- /dev/null
+++ b/bench/power/add_vsx2.ptt
@@ -0,0 +1,20 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 1
+BYTES 24
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+
+LOOP 4
+lxvd2x x1, STR0, r7
+lxvd2x x2, STR1, r7
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r7
+addi r7, r7, 32
+
+lxvd2x x1, STR0, r8
+lxvd2x x2, STR1, r8
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r8
+addi r8, r8, 32
diff --git a/bench/power/add_vsx4.ptt b/bench/power/add_vsx4.ptt
new file mode 100644
index 000000000..e91f0971b
--- /dev/null
+++ b/bench/power/add_vsx4.ptt
@@ -0,0 +1,34 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 1
+BYTES 24
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+LOOP 8
+lxvd2x x1, STR0, r7
+lxvd2x x2, STR1, r7
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r7
+addi r7, r7, 64
+
+lxvd2x x3, STR0, r8
+lxvd2x x4, STR1, r8
+xvadddp x5, x3, x4
+stxvd2x x5, STR2, r8
+addi r8, r8, 64
+
+lxvd2x x1, STR0, r9
+lxvd2x x2, STR1, r9
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r9
+addi r9, r9, 64
+
+lxvd2x x1, STR0, r10
+lxvd2x x2, STR1, r10
+xvadddp x0, x1, x2
+stxvd2x x0, STR2, r10
+addi r10, r10, 64
diff --git a/bench/power/add_vsx4_mem.ptt b/bench/power/add_vsx4_mem.ptt
new file mode 100644
index 000000000..11ad990de
--- /dev/null
+++ b/bench/power/add_vsx4_mem.ptt
@@ -0,0 +1,38 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 1
+BYTES 24
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+LOOP 8
+lxvd2x x1, STR0, r7
+lxvd2x x2, STR1, r7
+xvadddp x0, x1, x2
+dcbz STR2, r7
+stxvd2x x0, STR2, r7
+addi r7, r7, 64
+
+lxvd2x x1, STR0, r8
+lxvd2x x2, STR1, r8
+xvadddp x0, x1, x2
+dcbz STR2, r8
+stxvd2x x0, STR2, r8
+addi r8, r8, 64
+
+lxvd2x x1, STR0, r9
+lxvd2x x2, STR1, r9
+xvadddp x0, x1, x2
+dcbz STR2, r9
+stxvd2x x0, STR2, r9
+addi r9, r9, 64
+
+lxvd2x x1, STR0, r10
+lxvd2x x2, STR1, r10
+xvadddp x0, x1, x2
+dcbz STR2, r10
+stxvd2x x0, STR2, r10
+addi r10, r10, 64
diff --git a/bench/power/copy_scalar4.ptt b/bench/power/copy_scalar4.ptt
new file mode 100644
index 000000000..6dd430add
--- /dev/null
+++ b/bench/power/copy_scalar4.ptt
@@ -0,0 +1,21 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16
+
+LOOP 4
+
+lfd     0, 0(STR0)
+stfd    0, 0(STR1)
+
+lfd     1, 8(STR0)
+stfd    1, 8(STR1)
+
+lfd     2, 16(STR0)
+stfd    2, 16(STR1)
+
+lfd     3, 24(STR0)
+stfd    3, 24(STR1)
+
+addi STR0, STR0, 32
+addi STR1, STR1, 32
diff --git a/bench/power/copy_vsx4.ptt b/bench/power/copy_vsx4.ptt
new file mode 100644
index 000000000..85d43dd02
--- /dev/null
+++ b/bench/power/copy_vsx4.ptt
@@ -0,0 +1,31 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 0
+BYTES 16 
+INSTR_LOOP 3
+INSTR_CONST 1
+UOPS 4
+LOADS 1
+STORES 1
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+LOOP 8
+lxvd2x x1, STR0, r7
+stxvd2x x1, STR1, r7
+addi r7, r7, 64
+
+lxvd2x x1, STR0, r8
+stxvd2x x1, STR1, r8
+addi r8, r8, 64
+
+lxvd2x x1, STR0, r9
+stxvd2x x1, STR1, r9
+addi r9, r9, 64
+
+lxvd2x x1, STR0, r10
+stxvd2x x1, STR1, r10
+addi r10, r10, 64
+addi r10, r10, 64
diff --git a/bench/power/daxpy_vsx4.ptt b/bench/power/daxpy_vsx4.ptt
new file mode 100644
index 000000000..49fa744ac
--- /dev/null
+++ b/bench/power/daxpy_vsx4.ptt
@@ -0,0 +1,46 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+
+.set vec0,0; .set vec1,1; .set vec2,2; .set vec3,3;
+.set vec4,4; .set vec5,5; .set vec6,6; .set vec7,7;
+.set vec8,8; .set vec9,9; .set vec10,10; .set vec11,11;
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubdp vec0, vec0, vec0    # set to zero
+xvsubdp vec1, vec1, vec1    # set to zero
+xvsubdp vec2, vec2, vec2    # set to zero
+xvsubdp vec3, vec3, vec3    # set to zero
+
+LOOP 8
+
+lxvx vec4, STR0, r7
+lxvx vec5, STR1, r7
+
+lxvx vec6, STR0, r8
+lxvx vec7, STR1, r8
+
+lxvx vec8, STR0, r9
+lxvx vec9, STR1, r9
+
+lxvx vec10, STR0, r10
+lxvx vec11, STR1, r10
+
+# NB: Not a real DAXPY but same instruction mix (we're multiplying x and y)
+xvmaddadp vec0, vec4, vec5
+xvmaddadp vec1, vec6, vec7
+xvmaddadp vec2, vec8, vec9
+xvmaddadp vec3, vec10, vec11
+
+stxvx vec0, STR0, r7
+stxvx vec1, STR0, r8
+stxvx vec2, STR0, r9
+stxvx vec3, STR0, r10
+
+addi STR0, STR0, 64
+addi STR1, STR1, 64
diff --git a/bench/power/ddot_kahan_scalar4_dp.ptt b/bench/power/ddot_kahan_scalar4_dp.ptt
new file mode 100644
index 000000000..2858e63c8
--- /dev/null
+++ b/bench/power/ddot_kahan_scalar4_dp.ptt
@@ -0,0 +1,85 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+
+li r10, 6           # six elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+
+# set accumulation registers to zero
+fsub     0, 0, 0
+fsub     1, 1, 1
+fsub     2, 2, 2
+fsub     3, 3, 3
+fsub     4, 4, 4
+fsub     5, 5, 5
+
+# set c registers to zero
+fsub     6, 6, 6
+fsub     7, 7, 7
+fsub     8, 8, 8
+fsub     9, 9, 9
+fsub     10, 10, 10
+fsub     11, 11, 11
+
+LOOP 6
+# load A[i]
+lfd      12, 0(STR0)
+lfd      13, 8(STR0)
+lfd      14, 16(STR0)
+lfd      15, 24(STR0)
+lfd      16, 32(STR0)
+lfd      17, 40(STR0)
+
+# load B[i]
+lfd      18, 0(STR1)
+lfd      19, 8(STR1)
+lfd      20, 16(STR1)
+lfd      21, 24(STR1)
+lfd      22, 32(STR1)
+lfd      23, 40(STR1)
+
+# y = A[i]*B[i]-c
+# fmsub FRT,FRA,FRC,FRB --> FRT <- [(FRA)×(FRC)] - (FRB)
+fmsub   6, 12, 18, 6
+fmsub   7, 13, 19, 7
+fmsub   8, 14, 20, 8
+fmsub   9, 15, 21, 9
+fmsub   10, 16, 22, 10
+fmsub   11, 17, 23, 11
+
+# t = sum + y
+fadd     24, 0, 6
+fadd     25, 1, 7
+fadd     26, 2, 8
+fadd     27, 3, 9
+fadd     28, 4, 10 
+fadd     29, 5, 11
+
+# tmp = t - sum; we can overwrite registers holding 'sum'
+fsub     0, 24, 0
+fsub     1, 25, 1
+fsub     2, 26, 2
+fsub     3, 27, 3
+fsub     4, 28, 4
+fsub     5, 29, 5
+
+# c = tmp - y = (t - sum) - y; we can overwrite register holding 'y'
+fsub     6, 0, 6
+fsub     7, 1, 7
+fsub     8, 2, 8
+fsub     9, 3, 9
+fsub     10, 4, 10
+fsub     11, 5, 11 
+
+# sum = t
+fmr     0, 24
+fmr     1, 25
+fmr     2, 26
+fmr     3, 27
+fmr     4, 28
+fmr     5, 29
+
+addi STR0, STR0, 48
+addi STR1, STR1, 48
diff --git a/bench/power/ddot_kahan_scalar4_sp.ptt b/bench/power/ddot_kahan_scalar4_sp.ptt
new file mode 100644
index 000000000..68c1ec595
--- /dev/null
+++ b/bench/power/ddot_kahan_scalar4_sp.ptt
@@ -0,0 +1,85 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+
+li r10, 6           # six elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+
+# set accumulation registers to zero
+fsubs     0, 0, 0
+fsubs     1, 1, 1
+fsubs     2, 2, 2
+fsubs     3, 3, 3
+fsubs     4, 4, 4
+fsubs     5, 5, 5
+
+# set c registers to zero
+fsubs     6, 6, 6
+fsubs     7, 7, 7
+fsubs     8, 8, 8
+fsubs     9, 9, 9
+fsubs     10, 10, 10
+fsubs     11, 11, 11
+
+LOOP 6
+# load A[i]
+lfs      12, 0(STR0)
+lfs      13, 4(STR0)
+lfs      14, 8(STR0)
+lfs      15, 12(STR0)
+lfs      16, 16(STR0)
+lfs      17, 20(STR0)
+
+# load B[i]
+lfs      18, 0(STR1)
+lfs      19, 4(STR1)
+lfs      20, 8(STR1)
+lfs      21, 12(STR1)
+lfs      22, 16(STR1)
+lfs      23, 20(STR1)
+
+# y = A[i]*B[i]-c
+# fmsub FRT,FRA,FRC,FRB --> FRT <- [(FRA)×(FRC)] - (FRB)
+fmsubs  6, 12, 18, 6
+fmsubs  7, 13, 19, 7
+fmsubs  8, 14, 20, 8
+fmsubs  9, 15, 21, 9
+fmsubs  10, 16, 22, 10
+fmsubs  11, 17, 23, 11
+
+# t = sum + y
+fadds     24, 0, 6
+fadds     25, 1, 7
+fadds     26, 2, 8
+fadds     27, 3, 9
+fadds     28, 4, 10 
+fadds     29, 5, 11
+
+# tmp = t - sum; we can overwrite registers holding 'sum'
+fsubs     0, 24, 0
+fsubs     1, 25, 1
+fsubs     2, 26, 2
+fsubs     3, 27, 3
+fsubs     4, 28, 4
+fsubs     5, 29, 5
+
+# c = tmp - y = (t - sum) - y; we can overwrite register holding 'y'
+fsubs     6, 0, 6
+fsubs     7, 1, 7
+fsubs     8, 2, 8
+fsubs     9, 3, 9
+fsubs     10, 4, 10
+fsubs     11, 5, 11 
+
+# sum = t
+fmr     0, 24
+fmr     1, 25
+fmr     2, 26
+fmr     3, 27
+fmr     4, 28
+fmr     5, 29
+
+addi STR0, STR0, 24
+addi STR1, STR1, 24
diff --git a/bench/power/ddot_kahan_vsx4_dp.ptt b/bench/power/ddot_kahan_vsx4_dp.ptt
new file mode 100644
index 000000000..d4b5c8e77
--- /dev/null
+++ b/bench/power/ddot_kahan_vsx4_dp.ptt
@@ -0,0 +1,91 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+
+li r10, 12           # twelve (6x2) elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r6, 0            # load immediate value of 0 into r6 (used as offset in addressing)
+li r7, 16           # load immediate value of 16 into r7 (used as offset in addressing)
+li r8, 32           # load immediate value of 32 into r8 (used as offset in addressing)
+li r9, 48           # load immediate value of 48 into r9 (used as offset in addressing)
+li r10, 64          # load immediate value of 64 into r10 (used as offset in addressing)
+li 11, 80          # load immediate value of 80 into r11 (used as offset in addressing)
+
+# set accumulation registers to zero
+xvsubdp     0, 0, 0
+xvsubdp     1, 1, 1
+xvsubdp     2, 2, 2
+xvsubdp     3, 3, 3
+xvsubdp     4, 4, 4
+xvsubdp     5, 5, 5
+
+# set c registers to zero
+xvsubdp     6, 6, 6
+xvsubdp     7, 7, 7
+xvsubdp     8, 8, 8
+xvsubdp     9, 9, 9
+xvsubdp     10, 10, 10
+xvsubdp     11, 11, 11
+
+LOOP 12
+# load A[i]
+lxvd2x      12, STR0, r6
+lxvd2x      13, STR0, r7
+lxvd2x      14, STR0, r8
+lxvd2x      15, STR0, r9
+lxvd2x      16, STR0, r10
+lxvd2x      17, STR0, 11
+
+# load B[i]
+lxvd2x      18, STR1, r6
+lxvd2x      19, STR1, r7
+lxvd2x      20, STR1, r8
+lxvd2x      21, STR1, r9
+lxvd2x      22, STR1, r10
+lxvd2x      23, STR1, 11
+
+# y = A[i]*B[i]-c
+# xvmsubadp xt, xa, xb --> xt = xa*xb-xt
+xvmsubadp   6, 12, 18
+xvmsubadp   7, 13, 19
+xvmsubadp   8, 14, 20
+xvmsubadp   9, 15, 21
+xvmsubadp   10, 16, 22
+xvmsubadp   11, 17, 23
+
+# t = sum + y
+xvadddp     24, 0, 6
+xvadddp     25, 1, 7
+xvadddp     26, 2, 8
+xvadddp     27, 3, 9
+xvadddp     28, 4, 10 
+xvadddp     29, 5, 11
+
+# tmp = t - sum; we can overwrite registers holding 'sum'
+xvsubdp     0, 24, 0
+xvsubdp     1, 25, 1
+xvsubdp     2, 26, 2
+xvsubdp     3, 27, 3
+xvsubdp     4, 28, 4
+xvsubdp     5, 29, 5
+
+# c = tmp - y = (t - sum) - y; we can overwrite register holding 'y'
+xvsubdp     6, 0, 6
+xvsubdp     7, 1, 7
+xvsubdp     8, 2, 8
+xvsubdp     9, 3, 9
+xvsubdp     10, 4, 10
+xvsubdp     11, 5, 11 
+
+# sum = t
+xvmovdp     0, 24
+xvmovdp     1, 25
+xvmovdp     2, 26
+xvmovdp     3, 27
+xvmovdp     4, 28
+xvmovdp     5, 29
+
+addi STR0, STR0, 96
+addi STR1, STR1, 96
diff --git a/bench/power/ddot_kahan_vsx4_sp_reorder.ptt b/bench/power/ddot_kahan_vsx4_sp_reorder.ptt
new file mode 100644
index 000000000..3086fbf45
--- /dev/null
+++ b/bench/power/ddot_kahan_vsx4_sp_reorder.ptt
@@ -0,0 +1,88 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+
+li r10, 24           # twenty-four (6x4) elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r6, 0            # load immediate value of 0 into r6 (used as offset in addressing)
+li r7, 16           # load immediate value of 16 into r7 (used as offset in addressing)
+li r8, 32           # load immediate value of 32 into r8 (used as offset in addressing)
+li r9, 48           # load immediate value of 48 into r9 (used as offset in addressing)
+li r10, 64          # load immediate value of 64 into r10 (used as offset in addressing)
+li 11, 80           # load immediate value of 80 into r11 (used as offset in addressing)
+
+# set accumulation registers to zero
+xvsubsp     0, 0, 0
+xvsubsp     1, 1, 1
+xvsubsp     2, 2, 2
+xvsubsp     3, 3, 3
+xvsubsp     4, 4, 4
+xvsubsp     5, 5, 5
+
+# set c registers to zero
+xvsubsp     6, 6, 6
+xvsubsp     7, 7, 7
+xvsubsp     8, 8, 8
+xvsubsp     9, 9, 9
+xvsubsp     10, 10, 10
+xvsubsp     11, 11, 11
+
+LOOP 24
+# load A[i]
+lxvd2x      12, STR0, r6
+lxvd2x      18, STR1, r6
+lxvd2x      13, STR0, r7
+lxvd2x      19, STR1, r7
+xvmsubasp   26, 14, 20
+xvmsubasp   27, 15, 21
+xvaddsp     34, 4, 28
+xvaddsp     35, 5, 29
+xvsubsp     36, 30, 0
+xvsubsp     37, 31, 1
+xvsubsp     8, 38, 26
+xvsubsp     9, 39, 27
+xvmovsp     4, 34
+xvmovsp     5, 35
+
+lxvd2x      14, STR0, r8
+lxvd2x      20, STR1, r8
+lxvd2x      15, STR0, r9
+lxvd2x      21, STR1, r9
+xvmsubasp   28, 16, 22
+xvmsubasp   29, 17, 23
+xvaddsp     30, 0, 24
+xvaddsp     31, 1, 25
+xvsubsp     38, 32, 2
+xvsubsp     39, 33, 3
+xvsubsp     10, 40, 28
+xvsubsp     11, 41, 29
+xvmovsp     0, 30
+xvmovsp     1, 31
+
+lxvd2x      16, STR0, r10
+lxvd2x      22, STR1, r10
+lxvd2x      17, STR0, 11
+lxvd2x      23, STR1, 11
+    addi STR0, STR0, 96
+    addi STR1, STR1, 96
+xvmsubasp   24, 12, 18
+xvmsubasp   25, 13, 19
+xvaddsp     32, 2, 26
+xvaddsp     33, 3, 27
+xvsubsp     40, 34, 4
+xvsubsp     41, 35, 5
+xvsubsp     6, 36, 24
+xvsubsp     7, 37, 25
+xvmovsp     2, 32
+xvmovsp     3, 33
+
+
+
+# tmp = t - sum; we can overwrite registers holding 'sum'
+
+# c = tmp - y = (t - sum) - y; we can overwrite register holding 'y'
+
+# sum = t
+
diff --git a/bench/power/ddot_scalar4.ptt b/bench/power/ddot_scalar4.ptt
new file mode 100644
index 000000000..e4dd2377f
--- /dev/null
+++ b/bench/power/ddot_scalar4.ptt
@@ -0,0 +1,30 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+
+fsub    0, 0, 0     # zero
+fsub    1, 1, 1     # zero
+fsub    2, 2, 2     # zero
+fsub    3, 3, 3     # zero
+
+LOOP 4
+
+lfd    4, 0(STR0)       # f4 = double at byte offset 0 of STR0
+lfd    5, 0(STR1)       # f5 = double at byte offset 0 of STR1
+fmadd  0, 4, 5, 0       # f0 = f4 * f5 + f0
+
+lfd    6, 8(STR0)       # f6 = double at byte offset 8 of STR0
+lfd    7, 8(STR1)       # f7 = double at byte offset 8 of STR1
+fmadd  1, 6, 7, 1       # f1 = f6 * f7 + f1
+
+lfd    8, 16(STR0)      # f8 = double at byte offset 16 of STR0
+lfd    9, 16(STR1)      # f9 = double at byte offset 16 of STR1
+fmadd  2, 8, 9, 2       # f2 = f8 * f9 + f2
+
+lfd    10, 24(STR0)     # f10 = double at byte offset 24 of STR0
+lfd    11, 24(STR1)     # f11 = double at byte offset 24 of STR1
+fmadd  3, 10, 11, 3     # f3 = f10 * f11 + f3
+
+addi STR0, STR0, 32     # advance STR0 by 32 bytes (4 doubles)
+addi STR1, STR1, 32     # advance STR1 by 32 bytes (4 doubles)
diff --git a/bench/power/ddot_scalar4_sp.ptt b/bench/power/ddot_scalar4_sp.ptt
new file mode 100644
index 000000000..c4a32b0e1
--- /dev/null
+++ b/bench/power/ddot_scalar4_sp.ptt
@@ -0,0 +1,33 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+
+li r10, 4           # four elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+fsubs   0, 0, 0     # zero
+fsubs   1, 1, 1     # zero
+fsubs   2, 2, 2     # zero
+fsubs   3, 3, 3     # zero
+
+LOOP 4
+
+lfs    4, 0(STR0)
+lfs    5, 0(STR1)
+fmadds 0, 4, 5, 0
+
+lfs    6, 4(STR0)
+lfs    7, 4(STR1)
+fmadds 1, 6, 7, 1
+
+lfs    8, 8(STR0)
+lfs    9, 8(STR1)
+fmadds 2, 8, 9, 2
+
+lfs    10, 12(STR0)
+lfs    11, 12(STR1)
+fmadds 3, 10, 11, 3
+
+addi STR0, STR0, 16
+addi STR1, STR1, 16
diff --git a/bench/power/ddot_vsx4.ptt b/bench/power/ddot_vsx4.ptt
new file mode 100644
index 000000000..3568157e7
--- /dev/null
+++ b/bench/power/ddot_vsx4.ptt
@@ -0,0 +1,35 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 2
+BYTES 16
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubdp 0, 0, 0    # set to zero
+xvsubdp 1, 1, 1    # set to zero
+xvsubdp 2, 2, 2    # set to zero
+xvsubdp 3, 3, 3    # set to zero
+
+LOOP 8
+
+lxvd2x 4, STR0, r7
+lxvd2x 5, STR1, r7
+xvmaddadp 0, 4, 5
+
+lxvd2x 6, STR0, r8
+lxvd2x 7, STR1, r8
+xvmaddadp 1, 6, 7
+
+lxvd2x 8, STR0, r9
+lxvd2x 9, STR1, r9
+xvmaddadp 2, 8, 9
+
+lxvd2x 10, STR0, r10
+lxvd2x 11, STR1, r10
+xvmaddadp 3, 10, 11
+
+addi STR0, STR0, 64
+addi STR1, STR1, 64
diff --git a/bench/power/ddot_vsx4_sp.ptt b/bench/power/ddot_vsx4_sp.ptt
new file mode 100644
index 000000000..4ca9a0261
--- /dev/null
+++ b/bench/power/ddot_vsx4_sp.ptt
@@ -0,0 +1,38 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+
+li r10, 16          # sixteen (4x4) single-precision elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubsp 0, 0, 0    # set to zero
+xvsubsp 1, 1, 1    # set to zero
+xvsubsp 2, 2, 2    # set to zero
+xvsubsp 3, 3, 3    # set to zero
+
+LOOP 16
+
+lxvd2x 4, STR0, r7
+lxvd2x 5, STR1, r7
+xvmaddasp 0, 4, 5
+
+lxvd2x 6, STR0, r8
+lxvd2x 7, STR1, r8
+xvmaddasp 1, 6, 7
+
+lxvd2x 8, STR0, r9
+lxvd2x 9, STR1, r9
+xvmaddasp 2, 8, 9
+
+lxvd2x 10, STR0, r10
+lxvd2x 11, STR1, r10
+xvmaddasp 3, 10, 11
+
+addi STR0, STR0, 64
+addi STR1, STR1, 64
diff --git a/bench/power/ddot_vsx4_sp_new.ptt b/bench/power/ddot_vsx4_sp_new.ptt
new file mode 100644
index 000000000..290f9bdf4
--- /dev/null
+++ b/bench/power/ddot_vsx4_sp_new.ptt
@@ -0,0 +1,41 @@
+STREAMS 2
+TYPE SINGLE
+FLOPS 2
+BYTES 8
+
+li r10, 16          # sixteen (4x4) single-precision elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubsp 0, 0, 0    # set to zero
+xvsubsp 1, 1, 1    # set to zero
+xvsubsp 2, 2, 2    # set to zero
+xvsubsp 3, 3, 3    # set to zero
+
+LOOP 16
+
+lxvd2x 4, STR0, r7
+lxvd2x 5, STR1, r7
+lxvd2x 6, STR0, r8
+lxvd2x 7, STR1, r8
+
+addi STR0, STR0, 64
+
+xvmaddasp 2, 8, 9
+xvmaddasp 3, 10, 11
+
+lxvd2x 8, STR0, r9
+lxvd2x 9, STR1, r9
+lxvd2x 10, STR0, r10
+lxvd2x 11, STR1, r10
+
+addi STR1, STR1, 64
+
+xvmaddasp 0, 4, 5
+xvmaddasp 1, 6, 7
+
+
diff --git a/bench/power/gs_fwd_scalar.ptt b/bench/power/gs_fwd_scalar.ptt
new file mode 100644
index 000000000..8c387caac
--- /dev/null
+++ b/bench/power/gs_fwd_scalar.ptt
@@ -0,0 +1,32 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 3
+BYTES 16
+
+.set vec0,0; .set vec1,1; .set vec2,2; .set vec3,3;
+.set vec4,4; .set vec5,5; .set vec6,6; .set vec7,7;
+.set vec8,8; .set vec9,9; .set vec10,10; .set vec11,11;
+
+mtctr r3            # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+
+xssubdp vec0, vec0, vec0    # set to zero
+xssubdp vec1, vec1, vec1    # set to zero
+xssubdp vec2, vec2, vec2    # set to zero
+xssubdp vec3, vec3, vec3    # set to zero
+xssubdp vec4, vec4, vec4    # set to zero
+xssubdp vec5, vec5, vec5    # set to zero
+xssubdp vec10, vec10, vec10 # set to zero
+
+LOOP 1
+
+lfd vec0, 0(STR0)
+lfd vec10, 0(STR1)
+
+fmadd vec0, vec1, vec10, vec0 # uses both loaded values and a constant
+fmadd vec0, vec3, vec4, vec0  # has to wait for vec0 from previous FMA, has to wait for vec4 from previous iteration's mul
+fmul vec4, vec0, vec5   # has to wait for vec0 from previous FMA
+stfd vec4, 0(STR1)
+
+addi STR0, STR0, 8
+addi STR1, STR1, 8
diff --git a/bench/power/gs_fwd_vsx4.ptt b/bench/power/gs_fwd_vsx4.ptt
new file mode 100644
index 000000000..0f45f2e62
--- /dev/null
+++ b/bench/power/gs_fwd_vsx4.ptt
@@ -0,0 +1,32 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 3
+BYTES 16
+
+.set vec0,0; .set vec1,1; .set vec2,2; .set vec3,3;
+.set vec4,4; .set vec5,5; .set vec6,6; .set vec7,7;
+.set vec8,8; .set vec9,9; .set vec10,10; .set vec11,11;
+
+mtctr r3            # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+
+xvsubdp vec0, vec0, vec0    # set to zero
+xvsubdp vec1, vec1, vec1    # set to zero
+xvsubdp vec2, vec2, vec2    # set to zero
+xvsubdp vec3, vec3, vec3    # set to zero
+xvsubdp vec4, vec4, vec4    # set to zero
+xvsubdp vec5, vec5, vec5    # set to zero
+xvsubdp vec10, vec10, vec10 # set to zero
+
+LOOP 1
+
+lxvx vec0, STR0, r7
+lxvx vec10, STR1, r7
+
+xsmaddadp vec0, vec1, vec10 # uses both loaded values and a constant
+xsmaddadp vec0, vec3, vec4  # has to wait for vec0 from previous FMA, has to wait for vec4 from previous iteration's mul
+xsmuldp vec4, vec0, vec5   # has to wait for vec0 from previous FMA
+stxvx vec4, STR1, r7
+
+addi STR0, STR0, 8
+addi STR1, STR1, 8
diff --git a/bench/power/gs_fwd_vsx4_alt.ptt b/bench/power/gs_fwd_vsx4_alt.ptt
new file mode 100644
index 000000000..06e74cc61
--- /dev/null
+++ b/bench/power/gs_fwd_vsx4_alt.ptt
@@ -0,0 +1,31 @@
+STREAMS 2
+TYPE DOUBLE
+FLOPS 3
+BYTES 16
+
+.set vec0,0; .set vec1,1; .set vec2,2; .set vec3,3;
+.set vec4,4; .set vec5,5; .set vec6,6; .set vec7,7;
+.set vec8,8; .set vec9,9; .set vec10,10; .set vec11,11;
+
+mtctr r3            # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+
+xvsubdp vec0, vec0, vec0    # set to zero
+xvsubdp vec1, vec1, vec1    # set to zero
+xvsubdp vec2, vec2, vec2    # set to zero
+xvsubdp vec3, vec3, vec3    # set to zero
+xvsubdp vec4, vec4, vec4    # set to zero
+xvsubdp vec5, vec5, vec5    # set to zero
+xvsubdp vec10, vec10, vec10 # set to zero
+
+LOOP 1
+
+lxvx vec0, STR0, r7
+lxvx vec10, STR1, r7
+
+xsmaddadp vec0, vec1, vec10 # uses both loaded values and a constant
+xsmaddadp vec0, vec3, vec4  # has to wait for vec0 from previous FMA, has to wait for vec4 from previous iteration's mul
+xsmuldp vec4, vec0, vec5   # has to wait for vec0 from previous FMA
+stxvx vec4, STR1, r7
+
+addi r7, r7, 8
diff --git a/bench/power/lfd_stfd_lat.ptt b/bench/power/lfd_stfd_lat.ptt
new file mode 100644
index 000000000..e5f53cb10
--- /dev/null
+++ b/bench/power/lfd_stfd_lat.ptt
@@ -0,0 +1,40 @@
+STREAMS 1
+TYPE DOUBLE
+DESC Double-precision add, only scalar operations
+FLOPS 1
+BYTES 24
+LOADS 2
+STORES 1
+INSTR_LOOP 19
+INSTR_CONST 5
+UOPS 19
+
+li r10, 8           # eight elements per loop iteration (matches LOOP 8 below)
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+
+LOOP 8
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
+
+lfd     1, 0(STR0)
+stfd    1, 0(STR0)
diff --git a/bench/power/load_scalar4.ptt b/bench/power/load_scalar4.ptt
new file mode 100644
index 000000000..ab61079a3
--- /dev/null
+++ b/bench/power/load_scalar4.ptt
@@ -0,0 +1,42 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 1
+BYTES 8
+
+
+fsub 0, 0, 0
+fsub 1, 1, 1
+fsub 2, 2, 2
+fsub 3, 3, 3
+fsub 4, 4, 4
+fsub 5, 5, 5
+fsub 6, 6, 6
+fsub 7, 7, 7
+
+LOOP 8
+
+lfd     10, 0(STR0)
+fadd    0, 0, 10 
+
+lfd     11, 8(STR0)
+fadd    1, 1, 11
+
+lfd     12, 16(STR0)
+fadd    2, 2, 12
+
+lfd     13, 24(STR0)
+fadd    3, 3, 13
+
+lfd     14, 32(STR0)
+fadd    4, 4, 14
+
+lfd     15, 40(STR0)
+fadd    5, 5, 15
+
+lfd     16, 48(STR0)
+fadd    6, 6, 16
+
+lfd     17, 56(STR0)
+fadd    7, 7, 17
+
+addi STR0, STR0, 64
diff --git a/bench/power/load_vsx1.ptt b/bench/power/load_vsx1.ptt
new file mode 100644
index 000000000..311bca9ff
--- /dev/null
+++ b/bench/power/load_vsx1.ptt
@@ -0,0 +1,17 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 1
+BYTES 8
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubdp 0, 0, 0    # set to zero
+
+LOOP 2
+
+lxvd2x 10, STR0, r7
+xvadddp 0, 0, 10
+addi STR0, STR0, 16
diff --git a/bench/power/load_vsx4.ptt b/bench/power/load_vsx4.ptt
new file mode 100644
index 000000000..402045487
--- /dev/null
+++ b/bench/power/load_vsx4.ptt
@@ -0,0 +1,30 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 1
+BYTES 8
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubdp 0, 0, 0    # set to zero
+xvsubdp 1, 1, 1    # set to zero
+xvsubdp 2, 2, 2    # set to zero
+xvsubdp 3, 3, 3    # set to zero
+xvsubdp 4, 4, 4    # set to zero
+xvsubdp 5, 5, 5    # set to zero
+xvsubdp 6, 6, 6    # set to zero
+xvsubdp 7, 7, 7    # set to zero
+
+LOOP 8
+
+lxvd2x 10, STR0, r7
+xvadddp 0, 0, 10
+lxvd2x 11, STR0, r8
+xvadddp 1, 1, 11
+lxvd2x 12, STR0, r9
+xvadddp 2, 2, 12
+lxvd2x 13, STR0, r10
+xvadddp 3, 3, 13
+addi STR0, STR0, 64
diff --git a/bench/power/schoenauer_triad_scalar4.ptt b/bench/power/schoenauer_triad_scalar4.ptt
new file mode 100644
index 000000000..550f1a6d7
--- /dev/null
+++ b/bench/power/schoenauer_triad_scalar4.ptt
@@ -0,0 +1,39 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+
+li r10, 4           # four elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+
+LOOP 4
+
+lfd    0, 0(STR0)
+lfd    1, 0(STR1)
+lfd    2, 0(STR2)
+fmadd  0, 1, 2, 0
+stfd   0, 0(STR3)
+
+lfd    3, 8(STR0)
+lfd    4, 8(STR1)
+lfd    5, 8(STR2)
+fmadd  3, 4, 5, 3
+stfd   3, 8(STR3)
+
+lfd    6, 16(STR0)
+lfd    7, 16(STR1)
+lfd    8, 16(STR2)
+fmadd  6, 7, 8, 6
+stfd   6, 16(STR3)
+
+lfd    9, 24(STR0)
+lfd    10, 24(STR1)
+lfd    11, 24(STR2)
+fmadd  9, 10, 11, 9
+stfd   9, 24(STR3)
+
+addi STR0, STR0, 32
+addi STR1, STR1, 32
+addi STR2, STR2, 32
+addi STR3, STR3, 32
diff --git a/bench/power/schoenauer_triad_vsx4.ptt b/bench/power/schoenauer_triad_vsx4.ptt
new file mode 100644
index 000000000..c3fb99ee4
--- /dev/null
+++ b/bench/power/schoenauer_triad_vsx4.ptt
@@ -0,0 +1,43 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 32
+
+li r10, 8           # eight (4x2) elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r8, 0            # load immediate value of 0 into r8 (used as offset in addressing)
+li r9, 16           # load immediate value of 16 into r9 (used as offset in addressing)
+li r10, 32           # load immediate value of 32 into r10 (used as offset in addressing)
+li 11, 48          # load immediate value of 48 into r11 (used as offset in addressing)
+
+LOOP 8
+
+lxvd2x 0, STR0, r8
+lxvd2x 1, STR1, r8
+lxvd2x 2, STR2, r8
+xvmaddadp 0, 1, 2
+stxvd2x 0, STR3, r8
+
+lxvd2x 3, STR0, r9
+lxvd2x 4, STR1, r9
+lxvd2x 5, STR2, r9
+xvmaddadp 3, 4, 5
+stxvd2x 3, STR3, r9
+
+lxvd2x 6, STR0, r10
+lxvd2x 7, STR1, r10
+lxvd2x 8, STR2, r10
+xvmaddadp 6, 7, 8
+stxvd2x 6, STR3, r10
+
+lxvd2x 9, STR0, 11
+lxvd2x 10, STR1, 11
+lxvd2x 11, STR2, 11
+xvmaddadp 9, 10, 11
+stxvd2x 9, STR3, 11
+
+addi STR0, STR0, 64
+addi STR1, STR1, 64
+addi STR2, STR2, 64
+addi STR3, STR3, 64
diff --git a/bench/power/store_scalar4.ptt b/bench/power/store_scalar4.ptt
new file mode 100644
index 000000000..0ed749a6d
--- /dev/null
+++ b/bench/power/store_scalar4.ptt
@@ -0,0 +1,18 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+
+fsub    x0, x0, x0     # zero
+fsub    x1, x1, x1     # zero
+fsub    x2, x2, x2     # zero
+fsub    x3, x3, x3     # zero
+
+LOOP 4
+
+stfd    x0, 0(STR0)
+stfd    x1, 8(STR0)
+stfd    x2, 16(STR0)
+stfd    x3, 24(STR0)
+
+addi STR0, STR0, 32
diff --git a/bench/power/store_vsx4.ptt b/bench/power/store_vsx4.ptt
new file mode 100644
index 000000000..6f0786bf7
--- /dev/null
+++ b/bench/power/store_vsx4.ptt
@@ -0,0 +1,23 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 8
+
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+xvsubdp 0, 0, 0    # set to zero
+xvsubdp 1, 1, 1    # set to zero
+xvsubdp 2, 2, 2    # set to zero
+xvsubdp 3, 3, 3    # set to zero
+
+LOOP 8
+
+stxvd2x 0, STR0, r7
+stxvd2x 1, STR0, r8
+stxvd2x 2, STR0, r9
+stxvd2x 3, STR0, r10
+
+addi STR0, STR0, 64
diff --git a/bench/power/stream.ptt b/bench/power/stream.ptt
new file mode 100644
index 000000000..842166aab
--- /dev/null
+++ b/bench/power/stream.ptt
@@ -0,0 +1,12 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+ld  GPR10, SCALAR@got(2)
+lxvd2x  FPR2, 0, GPR10
+LOOP 2
+lxvd2x  FPR1, GPR9, STR0
+lxvd2x  FPR3, GPR9, STR1
+xvmaddmdp FPR1, FPR2, FPR3
+stxvd2x  FPR1, GPR9, STR2
+
diff --git a/bench/power/stream_triad_scalar4.ptt b/bench/power/stream_triad_scalar4.ptt
new file mode 100644
index 000000000..91ecf48a3
--- /dev/null
+++ b/bench/power/stream_triad_scalar4.ptt
@@ -0,0 +1,38 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+
+li r10, 4           # four elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+fsub    2, 2, 2     # zero
+fsub    5, 5, 5     # zero
+fsub    8, 8, 8     # zero
+fsub    11, 11, 11  # zero
+
+LOOP 4
+
+lfd    0, 0(STR0)
+lfd    1, 0(STR1)
+fmadd  0, 1, 2, 0
+stfd   0, 0(STR2)
+
+lfd    3, 8(STR0)
+lfd    4, 8(STR1)
+fmadd  3, 4, 5, 3
+stfd   3, 8(STR2)
+
+lfd    6, 16(STR0)
+lfd    7, 16(STR1)
+fmadd  6, 7, 8, 6
+stfd   6, 16(STR2)
+
+lfd    9, 24(STR0)
+lfd    10, 24(STR1)
+fmadd  9, 10, 11, 9
+stfd   9, 24(STR2)
+
+addi STR0, STR0, 32
+addi STR1, STR1, 32
+addi STR2, STR2, 32
diff --git a/bench/power/stream_triad_vsx4.ptt b/bench/power/stream_triad_vsx4.ptt
new file mode 100644
index 000000000..4d1b182c8
--- /dev/null
+++ b/bench/power/stream_triad_vsx4.ptt
@@ -0,0 +1,49 @@
+STREAMS 4
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+LOADS 2
+STORES 1
+INSTR_LOOP 19
+INSTR_CONST 13
+UOPS 19
+
+li r10, 8           # eight (4x2) elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r8, 0            # load immediate value of 0 into r8 (used as offset in addressing)
+li r9, 16           # load immediate value of 16 into r9 (used as offset in addressing)
+li r10, 32           # load immediate value of 32 into r10 (used as offset in addressing)
+li 11, 48          # load immediate value of 48 into r11 (used as offset in addressing)
+
+
+xvsubdp 2, 2, 2    # set to zero
+xvsubdp 5, 5, 5    # set to zero
+xvsubdp 8, 8, 8    # set to zero
+xvsubdp 11, 11, 11 # set to zero
+
+LOOP 8
+
+lxvd2x 0, STR0, r8
+lxvd2x 1, STR1, r8
+xvmaddadp 0, 1, 2
+stxvd2x 0, STR2, r8
+
+lxvd2x 3, STR0, r9
+lxvd2x 4, STR1, r9
+xvmaddadp 3, 4, 5
+stxvd2x 3, STR2, r9
+
+lxvd2x 6, STR0, r10
+lxvd2x 7, STR1, r10
+xvmaddadp 6, 7, 8
+stxvd2x 6, STR2, r10
+
+lxvd2x 9, STR0, 11
+lxvd2x 10, STR1, 11
+xvmaddadp 9, 10, 11
+stxvd2x 9, STR2, 11
+
+addi STR0, STR0, 64
+addi STR1, STR1, 64
+addi STR2, STR2, 64
diff --git a/bench/power/update_vsx4.ptt b/bench/power/update_vsx4.ptt
new file mode 100644
index 000000000..b28ee9112
--- /dev/null
+++ b/bench/power/update_vsx4.ptt
@@ -0,0 +1,34 @@
+STREAMS 1
+TYPE DOUBLE
+FLOPS 0
+BYTES 16 
+INSTR_LOOP 3
+INSTR_CONST 1
+UOPS 4
+LOADS 1
+STORES 1
+
+li r10, 8           # eight (4x2) elements per loop iteration
+divd r10, r3, r10   # r3 holds loop limit, divide by elements per loop iteration
+mtctr r10           # move to count register
+li r7, 0            # load immediate value of 0 into r7 (used as offset in addressing)
+li r8, 16           # load immediate value of 16 into r8 (used as offset in addressing)
+li r9, 32           # load immediate value of 32 into r9 (used as offset in addressing)
+li r10, 48          # load immediate value of 48 into r10 (used as offset in addressing)
+
+LOOP 8
+lxvd2x x1, STR0, r7
+stxvd2x x1, STR0, r7
+addi r7, r7, 64
+
+lxvd2x x1, STR0, r8
+stxvd2x x1, STR0, r8
+addi r8, r8, 64
+
+lxvd2x x1, STR0, r9
+stxvd2x x1, STR0, r9
+addi r9, r9, 64
+
+lxvd2x x1, STR0, r10
+stxvd2x x1, STR0, r10
+addi r10, r10, 64
diff --git a/bench/src/allocator.c b/bench/src/allocator.c
index 3c3775585..8bec7ebc4 100644
--- a/bench/src/allocator.c
+++ b/bench/src/allocator.c
@@ -96,13 +96,15 @@ allocator_allocateVector(
         int offset,
         DataType type,
         int stride,
-        bstring domainString)
+        bstring domainString,
+        int init_per_thread)
 {
     int i;
     size_t bytesize = 0;
     const AffinityDomain* domain = NULL;
     int errorCode;
     int elements = 0;
+    affinity_init();
 
     size_t typesize = allocator_dataTypeLength(type);
     bytesize = (size+offset) * typesize;
@@ -161,48 +163,50 @@ allocator_allocateVector(
             offset,
             LLU_CAST elements);
 
-    switch ( type )
+    if (!init_per_thread) /* per-thread init not requested: fill the vector with ones here */
     {
-        case INT:
-            {
-                int* sptr = (int*) (*ptr);
-                sptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
+        switch ( type )
+        {
+            case INT:
                 {
-                    sptr[i] = 1;
-                }
-                *ptr = (void*) sptr;
+                    int* sptr = (int*) (*ptr);
+                    sptr += offset;
 
-            }
-            break;
+                    for ( uint64_t i=0; i < size; i++ )
+                    {
+                        sptr[i] = 1;
+                    }
+                    *ptr = (void*) sptr;
 
-        case SINGLE:
-            {
-                float* sptr = (float*) (*ptr);
-                sptr += offset;
+                }
+                break;
 
-                for ( uint64_t i=0; i < size; i++ )
+            case SINGLE:
                 {
-                    sptr[i] = 1.0;
-                }
-                *ptr = (void*) sptr;
+                    float* sptr = (float*) (*ptr);
+                    sptr += offset;
 
-            }
-            break;
+                    for ( uint64_t i=0; i < size; i++ )
+                    {
+                        sptr[i] = 1.0;
+                    }
+                    *ptr = (void*) sptr;
 
-        case DOUBLE:
-            {
-                double* dptr = (double*) (*ptr);
-                dptr += offset;
+                }
+                break;
 
-                for ( uint64_t i=0; i < size; i++ )
+            case DOUBLE:
                 {
-                    dptr[i] = 1.0;
+                    double* dptr = (double*) (*ptr);
+                    dptr += offset;
+
+                    for ( uint64_t i=0; i < size; i++ )
+                    {
+                        dptr[i] = 1.0;
+                    }
+                    *ptr = (void*) dptr;
                 }
-                *ptr = (void*) dptr;
-            }
-            break;
+                break;
+        }
     }
 }
-
diff --git a/bench/src/barrier.c b/bench/src/barrier.c
index 018c9d1c6..bcda23a25 100644
--- a/bench/src/barrier.c
+++ b/bench/src/barrier.c
@@ -141,9 +141,14 @@ barrier_synchronize(BarrierData* barr)
         {
 #if defined(__arm__) || defined(__ARM_ARCH_8A)
             __asm__ ("nop");
-#else
+#endif
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
             __asm__ ("pause");
 #endif
+#ifdef _ARCH_PPC /* compiler-predefined on PowerPC targets (was misspelled _ARCH_PCC, so this branch never compiled) */
+            __asm__ ("nop"); /* PowerPC counterpart of the nop/pause hints above */
+#endif
+
         }
     }
 
diff --git a/bench/src/bench.c b/bench/src/bench.c
index 23bb4d512..438cbcc90 100644
--- a/bench/src/bench.c
+++ b/bench/src/bench.c
@@ -42,13 +42,15 @@
 #include <allocator.h>
 #include <threads.h>
 #include <barrier.h>
-#include <likwid.h>
+//#include <likwid.h>
+#include <likwid-marker.h>
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
 #define BARRIER   barrier_synchronize(&barr)
 
 #define EXECUTE(func)   \
+    LIKWID_MARKER_REGISTER("bench");  \
     BARRIER; \
     LIKWID_MARKER_START("bench");  \
     timer_start(&time); \
@@ -73,6 +75,7 @@ runTest(void* arg)
     size_t size;
     size_t vecsize;
     size_t i;
+    size_t j = 0;
     BarrierData barr;
     ThreadData* data;
     ThreadUserData* myData;
@@ -88,7 +91,7 @@ runTest(void* arg)
     /* Prepare ptrs for thread */
     vecsize = myData->size / data->numberOfThreads;
     size = myData->size / data->numberOfThreads;
-    
+
     size -= (size % myData->test->stride);
     myData->size = size;
     offset = data->threadId * size;
@@ -96,6 +99,17 @@ runTest(void* arg)
     if (size != vecsize && data->threadId == 0)
         printf("Sanitizing vector length to a multiple of the loop stride from %d elements (%d bytes) to %d elements (%d bytes)\n", vecsize, vecsize*myData->test->bytes, size, size*myData->test->bytes);
 
+    /* pin the thread */
+    likwid_pinThread(myData->processors[threadId]);
+    printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %zd\n",
+            data->groupId,
+            threadId,
+            data->globalThreadId,
+            affinity_threadGetProcessorId(),
+            LLU_CAST size,
+            offset);
+    BARRIER;
+
     switch ( myData->test->type )
     {
         case SINGLE:
@@ -105,6 +119,13 @@ runTest(void* arg)
                 {
                     sptr = (float*) myData->streams[i];
                     sptr +=  offset;
+                    if (myData->init_per_thread) /* fill this thread's part of stream i here */
+                    {
+                        for (j = 0; j < vecsize; j++) /* vecsize: per-thread length before stride sanitizing */
+                        {
+                            sptr[j] = 1.0;
+                        }
+                    }
                     myData->streams[i] = (float*) sptr;
                 }
             }
@@ -116,6 +137,13 @@ runTest(void* arg)
                 {
                     sptr = (int*) myData->streams[i];
                     sptr +=  offset;
+                    if (myData->init_per_thread) /* fill this thread's part of stream i here */
+                    {
+                        for (j = 0; j < vecsize; j++) /* vecsize: per-thread length before stride sanitizing */
+                        {
+                            sptr[j] = 1;
+                        }
+                    }
                     myData->streams[i] = (int*) sptr;
                 }
             }
@@ -127,21 +155,19 @@ runTest(void* arg)
                 {
                     dptr = (double*) myData->streams[i];
                     dptr +=  offset;
+                    if (myData->init_per_thread) /* fill this thread's part of stream i here */
+                    {
+                        for (j = 0; j < vecsize; j++) /* vecsize: per-thread length before stride sanitizing */
+                        {
+                            dptr[j] = 1.0;
+                        }
+                    }
                     myData->streams[i] = (double*) dptr;
                 }
             }
             break;
     }
 
-    /* pin the thread */
-    likwid_pinThread(myData->processors[threadId]);
-    printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %zd\n",
-            data->groupId,
-            threadId,
-            data->globalThreadId,
-            affinity_threadGetProcessorId(),
-            LLU_CAST size,
-            offset);
     BARRIER;
 
     /* Up to 10 streams the following registers are used for Array ptr:
@@ -473,7 +499,7 @@ getIterSingle(void* arg)
     //size = myData->size - (myData->size % myData->test->stride);
     vecsize = myData->size;
     size = myData->size / data->numberOfThreads;
-    
+
     size -= (size % myData->test->stride);
     offset = data->threadId * size;
 
@@ -771,4 +797,3 @@ getIterSingle(void* arg)
 #endif
     return NULL;
 }
-
diff --git a/bench/src/bstrlib.c b/bench/src/bstrlib.c
deleted file mode 100644
index 380269cd2..000000000
--- a/bench/src/bstrlib.c
+++ /dev/null
@@ -1,2955 +0,0 @@
-/*
- * =======================================================================================
- * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
- * on usage and license.
- */
-/*
- * bstrlib.c
- *
- * This file is the core module for implementing the bstring functions.
- */
-
-#include <stdio.h>
-#include <stddef.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include "bstrlib.h"
-
-/* Optionally include a mechanism for debugging memory */
-
-#if defined(MEMORY_DEBUG) || defined(BSTRLIB_MEMORY_DEBUG)
-#include "memdbg.h"
-#endif
-
-#ifndef bstr__alloc
-#define bstr__alloc(x) malloc (x)
-#endif
-
-#ifndef bstr__free
-#define bstr__free(p) free (p)
-#endif
-
-#ifndef bstr__realloc
-#define bstr__realloc(p,x) realloc ((p), (x))
-#endif
-
-#ifndef bstr__memcpy
-#define bstr__memcpy(d,s,l) memcpy ((d), (s), (l))
-#endif
-
-#ifndef bstr__memmove
-#define bstr__memmove(d,s,l) memmove ((d), (s), (l))
-#endif
-
-#ifndef bstr__memset
-#define bstr__memset(d,c,l) memset ((d), (c), (l))
-#endif
-
-#ifndef bstr__memcmp
-#define bstr__memcmp(d,c,l) memcmp ((d), (c), (l))
-#endif
-
-#ifndef bstr__memchr
-#define bstr__memchr(s,c,l) memchr ((s), (c), (l))
-#endif
-
-/* Just a length safe wrapper for memmove. */
-
-#define bBlockCopy(D,S,L) { if ((L) > 0) bstr__memmove ((D),(S),(L)); }
-
-/* Compute the snapped size for a given requested size.  By snapping to powers
-   of 2 like this, repeated reallocations are avoided. */
-static int snapUpSize (int i) {
-    if (i < 8) {
-        i = 8;
-    } else {
-        unsigned int j;
-        j = (unsigned int) i;
-
-        j |= (j >>  1);
-        j |= (j >>  2);
-        j |= (j >>  4);
-        j |= (j >>  8);        /* Ok, since int >= 16 bits */
-#if (UINT_MAX != 0xffff)
-        j |= (j >> 16);        /* For 32 bit int systems */
-#if (UINT_MAX > 0xffffffffUL)
-        j |= (j >> 32);        /* For 64 bit int systems */
-#endif
-#endif
-        /* Least power of two greater than i */
-        j++;
-        if ((int) j >= i) i = (int) j;
-    }
-    return i;
-}
-
-/*  int balloc (bstring b, int len)
- *
- *  Increase the size of the memory backing the bstring b to at least len.
- */
-int balloc (bstring b, int olen) {
-    int len;
-    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
-        b->mlen < b->slen || olen <= 0) {
-        return BSTR_ERR;
-    }
-
-    if (olen >= b->mlen) {
-        unsigned char * x;
-
-        if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
-
-        /* Assume probability of a non-moving realloc is 0.125 */
-        if (7 * b->mlen < 8 * b->slen) {
-
-            /* If slen is close to mlen in size then use realloc to reduce
-               the memory defragmentation */
-
-            reallocStrategy:;
-
-            x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-            if (x == NULL) {
-
-                /* Since we failed, try allocating the tighest possible 
-                   allocation */
-
-                if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
-                    return BSTR_ERR;
-                }
-            }
-        } else {
-
-            /* If slen is not close to mlen then avoid the penalty of copying
-               the extra bytes that are allocated, but not considered part of
-               the string */
-
-            if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
-
-                /* Perhaps there is no available memory for the two 
-                   allocations to be in memory at once */
-
-                goto reallocStrategy;
-
-            } else {
-                if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
-                bstr__free (b->data);
-            }
-        }
-        b->data = x;
-        b->mlen = len;
-        b->data[b->slen] = (unsigned char) '\0';
-    }
-
-    return BSTR_OK;
-}
-
-/*  int ballocmin (bstring b, int len)
- *
- *  Set the size of the memory backing the bstring b to len or b->slen+1,
- *  whichever is larger.  Note that repeated use of this function can degrade
- *  performance.
- */
-int ballocmin (bstring b, int len) {
-    unsigned char * s;
-
-    if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 || 
-        b->mlen < b->slen || len <= 0) {
-        return BSTR_ERR;
-    }
-
-    if (len < b->slen + 1) len = b->slen + 1;
-
-    if (len != b->mlen) {
-        s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-        if (NULL == s) return BSTR_ERR;
-        s[b->slen] = (unsigned char) '\0';
-        b->data = s;
-        b->mlen = len;
-    }
-
-    return BSTR_OK;
-}
-
-/*  bstring bfromcstr (const char * str)
- *
- *  Create a bstring which contains the contents of the '\0' terminated char *
- *  buffer str.
- */
-bstring bfromcstr (const char * str) {
-bstring b;
-int i;
-size_t j;
-
-    if (str == NULL) return NULL;
-    j = (strlen) (str);
-    i = snapUpSize ((int) (j + (2 - (j != 0))));
-    if (i <= (int) j) return NULL;
-
-    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-    if (NULL == b) return NULL;
-    b->slen = (int) j;
-    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-        bstr__free (b);
-        return NULL;
-    }
-
-    bstr__memcpy (b->data, str, j+1);
-    return b;
-}
-
-/*  bstring bfromcstralloc (int mlen, const char * str)
- *
- *  Create a bstring which contains the contents of the '\0' terminated char *
- *  buffer str.  The memory buffer backing the string is at least len 
- *  characters in length.
- */
-bstring bfromcstralloc (int mlen, const char * str) {
-bstring b;
-int i;
-size_t j;
-
-    if (str == NULL) return NULL;
-    j = (strlen) (str);
-    i = snapUpSize ((int) (j + (2 - (j != 0))));
-    if (i <= (int) j) return NULL;
-
-    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-    if (b == NULL) return NULL;
-    b->slen = (int) j;
-    if (i < mlen) i = mlen;
-
-    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-        bstr__free (b);
-        return NULL;
-    }
-
-    bstr__memcpy (b->data, str, j+1);
-    return b;
-}
-
-/*  bstring blk2bstr (const void * blk, int len)
- *
- *  Create a bstring which contains the content of the block blk of length 
- *  len.
- */
-bstring blk2bstr (const void * blk, int len) {
-bstring b;
-int i;
-
-    if (blk == NULL || len < 0) return NULL;
-    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-    if (b == NULL) return NULL;
-    b->slen = len;
-
-    i = len + (2 - (len != 0));
-    i = snapUpSize (i);
-
-    b->mlen = i;
-
-    b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
-    if (b->data == NULL) {
-        bstr__free (b);
-        return NULL;
-    }
-
-    if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
-    b->data[len] = (unsigned char) '\0';
-
-    return b;
-}
-
-/*  char * bstr2cstr (const_bstring s, char z)
- *
- *  Create a '\0' terminated char * buffer which is equal to the contents of 
- *  the bstring s, except that any contained '\0' characters are converted 
- *  to the character in z. This returned value should be freed with a 
- *  bcstrfree () call, by the calling application.
- */
-char * bstr2cstr (const_bstring b, char z) {
-int i, l;
-char * r;
-
-    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
-    l = b->slen;
-    r = (char *) bstr__alloc ((size_t) (l + 1));
-    if (r == NULL) return r;
-
-    for (i=0; i < l; i ++) {
-        r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
-    }
-
-    r[l] = (unsigned char) '\0';
-
-    return r;
-}
-
-/*  int bcstrfree (char * s)
- *
- *  Frees a C-string generated by bstr2cstr ().  This is normally unnecessary
- *  since it just wraps a call to bstr__free (), however, if bstr__alloc () 
- *  and bstr__free () have been redefined as a macros within the bstrlib 
- *  module (via defining them in memdbg.h after defining 
- *  BSTRLIB_MEMORY_DEBUG) with some difference in behaviour from the std 
- *  library functions, then this allows a correct way of freeing the memory 
- *  that allows higher level code to be independent from these macro 
- *  redefinitions.
- */
-int bcstrfree (char * s) {
-    if (s) {
-        bstr__free (s);
-        return BSTR_OK;
-    }
-    return BSTR_ERR;
-}
-
-/*  int bconcat (bstring b0, const_bstring b1)
- *
- *  Concatenate the bstring b1 to the bstring b0.
- */
-int bconcat (bstring b0, const_bstring b1) {
-int len, d;
-bstring aux = (bstring) b1;
-
-    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
-
-    d = b0->slen;
-    len = b1->slen;
-    if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
-
-    if (b0->mlen <= d + len + 1) {
-        ptrdiff_t pd = b1->data - b0->data;
-        if (0 <= pd && pd < b0->mlen) {
-            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-        }
-        if (balloc (b0, d + len + 1) != BSTR_OK) {
-            if (aux != b1) bdestroy (aux);
-            return BSTR_ERR;
-        }
-    }
-
-    bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
-    b0->data[d + len] = (unsigned char) '\0';
-    b0->slen = d + len;
-    if (aux != b1) bdestroy (aux);
-    return BSTR_OK;
-}
-
-/*  int bconchar (bstring b, char c)
-/ *
- *  Concatenate the single character c to the bstring b.
- */
-int bconchar (bstring b, char c) {
-int d;
-
-    if (b == NULL) return BSTR_ERR;
-    d = b->slen;
-    if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-    b->data[d] = (unsigned char) c;
-    b->data[d + 1] = (unsigned char) '\0';
-    b->slen++;
-    return BSTR_OK;
-}
-
-/*  int bcatcstr (bstring b, const char * s)
- *
- *  Concatenate a char * string to a bstring.
- */
-int bcatcstr (bstring b, const char * s) {
-char * d;
-int i, l;
-
-    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-     || b->mlen <= 0 || s == NULL) return BSTR_ERR;
-
-    /* Optimistically concatenate directly */
-    l = b->mlen - b->slen;
-    d = (char *) &b->data[b->slen];
-    for (i=0; i < l; i++) {
-        if ((*d++ = *s++) == '\0') {
-            b->slen += i;
-            return BSTR_OK;
-        }
-    }
-    b->slen += i;
-
-    /* Need to explicitely resize and concatenate tail */
-    return bcatblk (b, (const void *) s, (int) strlen (s));
-}
-
-/*  int bcatblk (bstring b, const void * s, int len)
- *
- *  Concatenate a fixed length buffer to a bstring.
- */
-int bcatblk (bstring b, const void * s, int len) {
-int nl;
-
-    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-     || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
-
-    if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
-    if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
-
-    bBlockCopy (&b->data[b->slen], s, (size_t) len);
-    b->slen = nl;
-    b->data[nl] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  bstring bstrcpy (const_bstring b)
- *
- *  Create a copy of the bstring b.
- */
-bstring bstrcpy (const_bstring b) {
-bstring b0;
-int i,j;
-
-    /* Attempted to copy an invalid string? */
-    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
-
-    b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
-    if (b0 == NULL) {
-        /* Unable to allocate memory for string header */
-        return NULL;
-    }
-
-    i = b->slen;
-    j = snapUpSize (i + 1);
-
-    b0->data = (unsigned char *) bstr__alloc (j);
-    if (b0->data == NULL) {
-        j = i + 1;
-        b0->data = (unsigned char *) bstr__alloc (j);
-        if (b0->data == NULL) {
-            /* Unable to allocate memory for string data */
-            bstr__free (b0);
-            return NULL;
-        }
-    }
-
-    b0->mlen = j;
-    b0->slen = i;
-
-    if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
-    b0->data[b0->slen] = (unsigned char) '\0';
-
-    return b0;
-}
-
-/*  int bassign (bstring a, const_bstring b)
- *
- *  Overwrite the string a with the contents of string b.
- */
-int bassign (bstring a, const_bstring b) {
-    if (b == NULL || b->data == NULL || b->slen < 0)
-        return BSTR_ERR;
-    if (b->slen != 0) {
-        if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
-        bstr__memmove (a->data, b->data, b->slen);
-    } else {
-        if (a == NULL || a->data == NULL || a->mlen < a->slen || 
-            a->slen < 0 || a->mlen == 0) 
-            return BSTR_ERR;
-    }
-    a->data[b->slen] = (unsigned char) '\0';
-    a->slen = b->slen;
-    return BSTR_OK;
-}
-
-/*  int bassignmidstr (bstring a, const_bstring b, int left, int len)
- *
- *  Overwrite the string a with the middle of contents of string b 
- *  starting from position left and running for a length len.  left and 
- *  len are clamped to the ends of b as with the function bmidstr.
- */
-int bassignmidstr (bstring a, const_bstring b, int left, int len) {
-    if (b == NULL || b->data == NULL || b->slen < 0)
-        return BSTR_ERR;
-
-    if (left < 0) {
-        len += left;
-        left = 0;
-    }
-
-    if (len > b->slen - left) len = b->slen - left;
-
-    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-        a->slen < 0 || a->mlen == 0)
-        return BSTR_ERR;
-
-    if (len > 0) {
-        if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
-        bstr__memmove (a->data, b->data + left, len);
-        a->slen = len;
-    } else {
-        a->slen = 0;
-    }
-    a->data[a->slen] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  int bassigncstr (bstring a, const char * str)
- *
- *  Overwrite the string a with the contents of char * string str.  Note that 
- *  the bstring a must be a well defined and writable bstring.  If an error 
- *  occurs BSTR_ERR is returned however a may be partially overwritten.
- */
-int bassigncstr (bstring a, const char * str) {
-int i;
-size_t len;
-    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-        a->slen < 0 || a->mlen == 0 || NULL == str) 
-        return BSTR_ERR;
-
-    for (i=0; i < a->mlen; i++) {
-        if ('\0' == (a->data[i] = str[i])) {
-            a->slen = i;
-            return BSTR_OK;
-        }
-    }
-
-    a->slen = i;
-    len = strlen (str + i);
-    if (len > INT_MAX || i + len + 1 > INT_MAX ||
-        0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
-    bBlockCopy (a->data + i, str + i, (size_t) len + 1);
-    a->slen += (int) len;
-    return BSTR_OK;
-}
-
-/*  int bassignblk (bstring a, const void * s, int len)
- *
- *  Overwrite the string a with the contents of the block (s, len).  Note that 
- *  the bstring a must be a well defined and writable bstring.  If an error 
- *  occurs BSTR_ERR is returned and a is not overwritten.
- */
-int bassignblk (bstring a, const void * s, int len) {
-    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-        a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1) 
-        return BSTR_ERR;
-    if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
-    bBlockCopy (a->data, s, (size_t) len);
-    a->data[len] = (unsigned char) '\0';
-    a->slen = len;
-    return BSTR_OK;
-}
-
-/*  int btrunc (bstring b, int n)
- *
- *  Truncate the bstring to at most n characters.
- */
-int btrunc (bstring b, int n) {
-    if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-    if (b->slen > n) {
-        b->slen = n;
-        b->data[n] = (unsigned char) '\0';
-    }
-    return BSTR_OK;
-}
-
-#define   upcase(c) (toupper ((unsigned char) c))
-#define downcase(c) (tolower ((unsigned char) c))
-#define   wspace(c) (isspace ((unsigned char) c))
-
-/*  int btoupper (bstring b)
- *
- *  Convert contents of bstring to upper case.
- */
-int btoupper (bstring b) {
-int i, len;
-    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-    for (i=0, len = b->slen; i < len; i++) {
-        b->data[i] = (unsigned char) upcase (b->data[i]);
-    }
-    return BSTR_OK;
-}
-
-/*  int btolower (bstring b)
- *
- *  Convert contents of bstring to lower case.
- */
-int btolower (bstring b) {
-int i, len;
-    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-    for (i=0, len = b->slen; i < len; i++) {
-        b->data[i] = (unsigned char) downcase (b->data[i]);
-    }
-    return BSTR_OK;
-}
-
-/*  int bstricmp (const_bstring b0, const_bstring b1)
- *
- *  Compare two strings without differentiating between case.  The return 
- *  value is the difference of the values of the characters where the two 
- *  strings first differ after lower case transformation, otherwise 0 is 
- *  returned indicating that the strings are equal.  If the lengths are 
- *  different, then a difference from 0 is given, but if the first extra 
- *  character is '\0', then it is taken to be the value UCHAR_MAX+1.
- */
-int bstricmp (const_bstring b0, const_bstring b1) {
-int i, v, n;
-
-    if (bdata (b0) == NULL || b0->slen < 0 || 
-        bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
-    if ((n = b0->slen) > b1->slen) n = b1->slen;
-    else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
-
-    for (i = 0; i < n; i ++) {
-        v  = (char) downcase (b0->data[i])
-           - (char) downcase (b1->data[i]);
-        if (0 != v) return v;
-    }
-
-    if (b0->slen > n) {
-        v = (char) downcase (b0->data[n]);
-        if (v) return v;
-        return UCHAR_MAX + 1;
-    }
-    if (b1->slen > n) {
-        v = - (char) downcase (b1->data[n]);
-        if (v) return v;
-        return - (int) (UCHAR_MAX + 1);
-    }
-    return BSTR_OK;
-}
-
-/*  int bstrnicmp (const_bstring b0, const_bstring b1, int n)
- *
- *  Compare two strings without differentiating between case for at most n
- *  characters.  If the position where the two strings first differ is
- *  before the nth position, the return value is the difference of the values
- *  of the characters, otherwise 0 is returned.  If the lengths are different
- *  and less than n characters, then a difference from 0 is given, but if the 
- *  first extra character is '\0', then it is taken to be the value 
- *  UCHAR_MAX+1.
- */
-int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
-int i, v, m;
-
-    if (bdata (b0) == NULL || b0->slen < 0 || 
-        bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
-    m = n;
-    if (m > b0->slen) m = b0->slen;
-    if (m > b1->slen) m = b1->slen;
-
-    if (b0->data != b1->data) {
-        for (i = 0; i < m; i ++) {
-            v  = (char) downcase (b0->data[i]);
-            v -= (char) downcase (b1->data[i]);
-            if (v != 0) return b0->data[i] - b1->data[i];
-        }
-    }
-
-    if (n == m || b0->slen == b1->slen) return BSTR_OK;
-
-    if (b0->slen > m) {
-        v = (char) downcase (b0->data[m]);
-        if (v) return v;
-        return UCHAR_MAX + 1;
-    }
-
-    v = - (char) downcase (b1->data[m]);
-    if (v) return v;
-    return - (int) (UCHAR_MAX + 1);
-}
-
-/*  int biseqcaseless (const_bstring b0, const_bstring b1)
- *
- *  Compare two strings for equality without differentiating between case.  
- *  If the strings differ other than in case, 0 is returned, if the strings 
- *  are the same, 1 is returned, if there is an error, -1 is returned.  If 
- *  the length of the strings are different, this function is O(1).  '\0' 
- *  termination characters are not treated in any special way.
- */
-int biseqcaseless (const_bstring b0, const_bstring b1) {
-int i, n;
-
-    if (bdata (b0) == NULL || b0->slen < 0 || 
-        bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
-    if (b0->slen != b1->slen) return BSTR_OK;
-    if (b0->data == b1->data || b0->slen == 0) return 1;
-    for (i=0, n=b0->slen; i < n; i++) {
-        if (b0->data[i] != b1->data[i]) {
-            unsigned char c = (unsigned char) downcase (b0->data[i]);
-            if (c != (unsigned char) downcase (b1->data[i])) return 0;
-        }
-    }
-    return 1;
-}
-
-/*  int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
- *
- *  Compare beginning of string b0 with a block of memory of length len 
- *  without differentiating between case for equality.  If the beginning of b0
- *  differs from the memory block other than in case (or if b0 is too short), 
- *  0 is returned, if the strings are the same, 1 is returned, if there is an 
- *  error, -1 is returned.  '\0' characters are not treated in any special 
- *  way.
- */
-int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
-int i;
-
-    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-        return BSTR_ERR;
-    if (b0->slen < len) return BSTR_OK;
-    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
-
-    for (i = 0; i < len; i ++) {
-        if (b0->data[i] != ((const unsigned char *) blk)[i]) {
-            if (downcase (b0->data[i]) != 
-                downcase (((const unsigned char *) blk)[i])) return 0;
-        }
-    }
-    return 1;
-}
-
-/*
- * int bltrimws (bstring b)
- *
- * Delete whitespace contiguous from the left end of the string.
- */
-int bltrimws (bstring b) {
-int i, len;
-
-    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-
-    for (len = b->slen, i = 0; i < len; i++) {
-        if (!wspace (b->data[i])) {
-            return bdelete (b, 0, i);
-        }
-    }
-
-    b->data[0] = (unsigned char) '\0';
-    b->slen = 0;
-    return BSTR_OK;
-}
-
-/*
- * int brtrimws (bstring b)
- *
- * Delete whitespace contiguous from the right end of the string.
- */
-int brtrimws (bstring b) {
-int i;
-
-    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-
-    for (i = b->slen - 1; i >= 0; i--) {
-        if (!wspace (b->data[i])) {
-            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-            b->slen = i + 1;
-            return BSTR_OK;
-        }
-    }
-
-    b->data[0] = (unsigned char) '\0';
-    b->slen = 0;
-    return BSTR_OK;
-}
-
-/*
- * int btrimws (bstring b)
- *
- * Delete whitespace contiguous from both ends of the string.
- */
-int btrimws (bstring b) {
-int i, j;
-
-    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-
-    for (i = b->slen - 1; i >= 0; i--) {
-        if (!wspace (b->data[i])) {
-            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-            b->slen = i + 1;
-            for (j = 0; wspace (b->data[j]); j++) {}
-            return bdelete (b, 0, j);
-        }
-    }
-
-    b->data[0] = (unsigned char) '\0';
-    b->slen = 0;
-    return BSTR_OK;
-}
-
-/*  int biseq (const_bstring b0, const_bstring b1)
- *
- *  Compare the string b0 and b1.  If the strings differ, 0 is returned, if 
- *  the strings are the same, 1 is returned, if there is an error, -1 is 
- *  returned.  If the length of the strings are different, this function is
- *  O(1).  '\0' termination characters are not treated in any special way.
- */
-int biseq (const_bstring b0, const_bstring b1) {
-    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-        b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
-    if (b0->slen != b1->slen) return BSTR_OK;
-    if (b0->data == b1->data || b0->slen == 0) return 1;
-    return !bstr__memcmp (b0->data, b1->data, b0->slen);
-}
-
-/*  int bisstemeqblk (const_bstring b0, const void * blk, int len)
- *
- *  Compare beginning of string b0 with a block of memory of length len for 
- *  equality.  If the beginning of b0 differs from the memory block (or if b0 
- *  is too short), 0 is returned, if the strings are the same, 1 is returned, 
- *  if there is an error, -1 is returned.  '\0' characters are not treated in 
- *  any special way.
- */
-int bisstemeqblk (const_bstring b0, const void * blk, int len) {
-int i;
-
-    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-        return BSTR_ERR;
-    if (b0->slen < len) return BSTR_OK;
-    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
-
-    for (i = 0; i < len; i ++) {
-        if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
-    }
-    return 1;
-}
-
-/*  int biseqcstr (const_bstring b, const char *s)
- *
- *  Compare the bstring b and char * string s.  The C string s must be '\0' 
- *  terminated at exactly the length of the bstring b, and the contents 
- *  between the two must be identical with the bstring b with no '\0' 
- *  characters for the two contents to be considered equal.  This is 
- *  equivalent to the condition that their current contents will be always be 
- *  equal when comparing them in the same format after converting one or the 
- *  other.  If the strings are equal 1 is returned, if they are unequal 0 is 
- *  returned and if there is a detectable error BSTR_ERR is returned.
- */
-int biseqcstr (const_bstring b, const char * s) {
-int i;
-    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-    for (i=0; i < b->slen; i++) {
-        if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
-    }
-    return s[i] == '\0';
-}
-
-/*  int biseqcstrcaseless (const_bstring b, const char *s)
- *
- *  Compare the bstring b and char * string s.  The C string s must be '\0' 
- *  terminated at exactly the length of the bstring b, and the contents 
- *  between the two must be identical except for case with the bstring b with 
- *  no '\0' characters for the two contents to be considered equal.  This is 
- *  equivalent to the condition that their current contents will be always be 
- *  equal ignoring case when comparing them in the same format after 
- *  converting one or the other.  If the strings are equal, except for case, 
- *  1 is returned, if they are unequal regardless of case 0 is returned and 
- *  if there is a detectable error BSTR_ERR is returned.
- */
-int biseqcstrcaseless (const_bstring b, const char * s) {
-int i;
-    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-    for (i=0; i < b->slen; i++) {
-        if (s[i] == '\0' || 
-            (b->data[i] != (unsigned char) s[i] && 
-             downcase (b->data[i]) != (unsigned char) downcase (s[i])))
-            return BSTR_OK;
-    }
-    return s[i] == '\0';
-}
-
-/*  int bstrcmp (const_bstring b0, const_bstring b1)
- *
- *  Compare the string b0 and b1.  If there is an error, SHRT_MIN is returned, 
- *  otherwise a value less than or greater than zero, indicating that the 
- *  string pointed to by b0 is lexicographically less than or greater than 
- *  the string pointed to by b1 is returned.  If the the string lengths are 
- *  unequal but the characters up until the length of the shorter are equal 
- *  then a value less than, or greater than zero, indicating that the string 
- *  pointed to by b0 is shorter or longer than the string pointed to by b1 is 
- *  returned.  0 is returned if and only if the two strings are the same.  If 
- *  the length of the strings are different, this function is O(n).  Like its
- *  standard C library counter part strcmp, the comparison does not proceed 
- *  past any '\0' termination characters encountered.
- */
-int bstrcmp (const_bstring b0, const_bstring b1) {
-int i, v, n;
-
-    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-    n = b0->slen; if (n > b1->slen) n = b1->slen;
-    if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
-        return BSTR_OK;
-
-    for (i = 0; i < n; i ++) {
-        v = ((char) b0->data[i]) - ((char) b1->data[i]);
-        if (v != 0) return v;
-        if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-    }
-
-    if (b0->slen > n) return 1;
-    if (b1->slen > n) return -1;
-    return BSTR_OK;
-}
-
-/*  int bstrncmp (const_bstring b0, const_bstring b1, int n)
- *
- *  Compare the string b0 and b1 for at most n characters.  If there is an 
- *  error, SHRT_MIN is returned, otherwise a value is returned as if b0 and 
- *  b1 were first truncated to at most n characters then bstrcmp was called
- *  with these new strings are paremeters.  If the length of the strings are 
- *  different, this function is O(n).  Like its standard C library counter 
- *  part strcmp, the comparison does not proceed past any '\0' termination 
- *  characters encountered.
- */
-int bstrncmp (const_bstring b0, const_bstring b1, int n) {
-int i, v, m;
-
-    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-    m = n;
-    if (m > b0->slen) m = b0->slen;
-    if (m > b1->slen) m = b1->slen;
-
-    if (b0->data != b1->data) {
-        for (i = 0; i < m; i ++) {
-            v = ((char) b0->data[i]) - ((char) b1->data[i]);
-            if (v != 0) return v;
-            if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-        }
-    }
-
-    if (n == m || b0->slen == b1->slen) return BSTR_OK;
-
-    if (b0->slen > m) return 1;
-    return -1;
-}
-
-/*  bstring bmidstr (const_bstring b, int left, int len)
- *
- *  Create a bstring which is the substring of b starting from position left
- *  and running for a length len (clamped by the end of the bstring b.)  If
- *  b is detectably invalid, then NULL is returned.  The section described 
- *  by (left, len) is clamped to the boundaries of b.
- */
-bstring bmidstr (const_bstring b, int left, int len) {
-
-    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
-
-    if (left < 0) {
-        len += left;
-        left = 0;
-    }
-
-    if (len > b->slen - left) len = b->slen - left;
-
-    if (len <= 0) return bfromcstr ("");
-    return blk2bstr (b->data + left, len);
-}
-
-/*  int bdelete (bstring b, int pos, int len)
- *
- *  Removes characters from pos to pos+len-1 inclusive and shifts the tail of 
- *  the bstring starting from pos+len to pos.  len must be positive for this 
- *  call to have any effect.  The section of the string described by (pos, 
- *  len) is clamped to boundaries of the bstring b.
- */
-int bdelete (bstring b, int pos, int len) {
-    /* Clamp to left side of bstring */
-    if (pos < 0) {
-        len += pos;
-        pos = 0;
-    }
-
-    if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 || 
-        b->mlen < b->slen || b->mlen <= 0) 
-        return BSTR_ERR;
-    if (len > 0 && pos < b->slen) {
-        if (pos + len >= b->slen) {
-            b->slen = pos;
-        } else {
-            bBlockCopy ((char *) (b->data + pos),
-                        (char *) (b->data + pos + len), 
-                        b->slen - (pos+len));
-            b->slen -= len;
-        }
-        b->data[b->slen] = (unsigned char) '\0';
-    }
-    return BSTR_OK;
-}
-
-/*  int bdestroy (bstring b)
- *
- *  Free up the bstring.  Note that if b is detectably invalid or not writable
- *  then no action is performed and BSTR_ERR is returned.  Like a freed memory
- *  allocation, dereferences, writes or any other action on b after it has 
- *  been bdestroyed is undefined.
- */
-int bdestroy (bstring b) {
-    if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
-        b->data == NULL)
-        return BSTR_ERR;
-
-    bstr__free (b->data);
-
-    /* In case there is any stale usage, there is one more chance to 
-       notice this error. */
-
-    b->slen = -1;
-    b->mlen = -__LINE__;
-    b->data = NULL;
-
-    bstr__free (b);
-    return BSTR_OK;
-}
-
-/*  int binstr (const_bstring b1, int pos, const_bstring b2)
- *
- *  Search for the bstring b2 in b1 starting from position pos, and searching 
- *  forward.  If it is found then return with the first position where it is 
- *  found, otherwise return BSTR_ERR.  Note that this is just a brute force 
- *  string searcher that does not attempt clever things like the Boyer-Moore 
- *  search algorithm.  Because of this there are many degenerate cases where 
- *  this can take much longer than it needs to.
- */
-int binstr (const_bstring b1, int pos, const_bstring b2) {
-int j, ii, ll, lf;
-unsigned char * d0;
-unsigned char c0;
-register unsigned char * d1;
-register unsigned char c1;
-register int i;
-
-    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-    if (b1->slen < pos || pos < 0) return BSTR_ERR;
-    if (b2->slen == 0) return pos;
-
-    /* No space to find such a string? */
-    if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
-
-    /* An obvious alias case */
-    if (b1->data == b2->data && pos == 0) return 0;
-
-    i = pos;
-
-    d0 = b2->data;
-    d1 = b1->data;
-    ll = b2->slen;
-
-    /* Peel off the b2->slen == 1 case */
-    c0 = d0[0];
-    if (1 == ll) {
-        for (;i < lf; i++) if (c0 == d1[i]) return i;
-        return BSTR_ERR;
-    }
-
-    c1 = c0;
-    j = 0;
-    lf = b1->slen - 1;
-
-    ii = -1;
-    if (i < lf) do {
-        /* Unrolled current character test */
-        if (c1 != d1[i]) {
-            if (c1 != d1[1+i]) {
-                i += 2;
-                continue;
-            }
-            i++;
-        }
-
-        /* Take note if this is the start of a potential match */
-        if (0 == j) ii = i;
-
-        /* Shift the test character down by one */
-        j++;
-        i++;
-
-        /* If this isn't past the last character continue */
-        if (j < ll) {
-            c1 = d0[j];
-            continue;
-        }
-
-        N0:;
-
-        /* If no characters mismatched, then we matched */
-        if (i == ii+j) return ii;
-
-        /* Shift back to the beginning */
-        i -= j;
-        j  = 0;
-        c1 = c0;
-    } while (i < lf);
-
-    /* Deal with last case if unrolling caused a misalignment */
-    if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
-
-    return BSTR_ERR;
-}
-
-/*  int binstrr (const_bstring b1, int pos, const_bstring b2)
- *
- *  Search for the bstring b2 in b1 starting from position pos, and searching 
- *  backward.  If it is found then return with the first position where it is 
- *  found, otherwise return BSTR_ERR.  Note that this is just a brute force 
- *  string searcher that does not attempt clever things like the Boyer-Moore 
- *  search algorithm.  Because of this there are many degenerate cases where 
- *  this can take much longer than it needs to.
- */
-int binstrr (const_bstring b1, int pos, const_bstring b2) {
-int j, i, l;
-unsigned char * d0, * d1;
-
-    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-    if (b1->slen == pos && b2->slen == 0) return pos;
-    if (b1->slen < pos || pos < 0) return BSTR_ERR;
-    if (b2->slen == 0) return pos;
-
-    /* Obvious alias case */
-    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
-
-    i = pos;
-    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
-
-    /* If no space to find such a string then snap back */
-    if (l + 1 <= i) i = l;
-    j = 0;
-
-    d0 = b2->data;
-    d1 = b1->data;
-    l  = b2->slen;
-
-    for (;;) {
-        if (d0[j] == d1[i + j]) {
-            j ++;
-            if (j >= l) return i;
-        } else {
-            i --;
-            if (i < 0) break;
-            j=0;
-        }
-    }
-
-    return BSTR_ERR;
-}
-
-/*  int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
- *
- *  Search for the bstring b2 in b1 starting from position pos, and searching 
- *  forward but without regard to case.  If it is found then return with the 
- *  first position where it is found, otherwise return BSTR_ERR.  Note that 
- *  this is just a brute force string searcher that does not attempt clever 
- *  things like the Boyer-Moore search algorithm.  Because of this there are 
- *  many degenerate cases where this can take much longer than it needs to.
- */
-int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
-int j, i, l, ll;
-unsigned char * d0, * d1;
-
-    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-    if (b1->slen < pos || pos < 0) return BSTR_ERR;
-    if (b2->slen == 0) return pos;
-
-    l = b1->slen - b2->slen + 1;
-
-    /* No space to find such a string? */
-    if (l <= pos) return BSTR_ERR;
-
-    /* An obvious alias case */
-    if (b1->data == b2->data && pos == 0) return BSTR_OK;
-
-    i = pos;
-    j = 0;
-
-    d0 = b2->data;
-    d1 = b1->data;
-    ll = b2->slen;
-
-    for (;;) {
-        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-            j ++;
-            if (j >= ll) return i;
-        } else {
-            i ++;
-            if (i >= l) break;
-            j=0;
-        }
-    }
-
-    return BSTR_ERR;
-}
-
-/*  int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
- *
- *  Search for the bstring b2 in b1 starting from position pos, and searching 
- *  backward but without regard to case.  If it is found then return with the 
- *  first position where it is found, otherwise return BSTR_ERR.  Note that 
- *  this is just a brute force string searcher that does not attempt clever 
- *  things like the Boyer-Moore search algorithm.  Because of this there are 
- *  many degenerate cases where this can take much longer than it needs to.
- */
-int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
-int j, i, l;
-unsigned char * d0, * d1;
-
-    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-    if (b1->slen == pos && b2->slen == 0) return pos;
-    if (b1->slen < pos || pos < 0) return BSTR_ERR;
-    if (b2->slen == 0) return pos;
-
-    /* Obvious alias case */
-    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
-
-    i = pos;
-    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
-
-    /* If no space to find such a string then snap back */
-    if (l + 1 <= i) i = l;
-    j = 0;
-
-    d0 = b2->data;
-    d1 = b1->data;
-    l  = b2->slen;
-
-    for (;;) {
-        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-            j ++;
-            if (j >= l) return i;
-        } else {
-            i --;
-            if (i < 0) break;
-            j=0;
-        }
-    }
-
-    return BSTR_ERR;
-}
-
-
-/*  int bstrchrp (const_bstring b, int c, int pos)
- *
- *  Search for the character c in b forwards from the position pos 
- *  (inclusive).
- */
-int bstrchrp (const_bstring b, int c, int pos) {
-unsigned char * p;
-
-    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-    p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
-    if (p) return (int) (p - b->data);
-    return BSTR_ERR;
-}
-
-/*  int bstrrchrp (const_bstring b, int c, int pos)
- *
- *  Search for the character c in b backwards from the position pos in string 
- *  (inclusive).
- */
-int bstrrchrp (const_bstring b, int c, int pos) {
-int i;
- 
-    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-    for (i=pos; i >= 0; i--) {
-        if (b->data[i] == (unsigned char) c) return i;
-    }
-    return BSTR_ERR;
-}
-
-#if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
-#define LONG_LOG_BITS_QTY (3)
-#define LONG_BITS_QTY (1 << LONG_LOG_BITS_QTY)
-#define LONG_TYPE unsigned char
-
-#define CFCLEN ((1 << CHAR_BIT) / LONG_BITS_QTY)
-struct charField { LONG_TYPE content[CFCLEN]; };
-#define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
-#define setInCharField(cf,idx) { \
-    unsigned int c = (unsigned int) (idx); \
-    (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
-}
-
-#else
-
-#define CFCLEN (1 << CHAR_BIT)
-struct charField { unsigned char content[CFCLEN]; };
-#define testInCharField(cf,c) ((cf)->content[(unsigned char) (c)])
-#define setInCharField(cf,idx) (cf)->content[(unsigned int) (idx)] = ~0
-
-#endif
-
-/* Convert a bstring to charField */
-static int buildCharField (struct charField * cf, const_bstring b) {
-int i;
-    if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
-    memset ((void *) cf->content, 0, sizeof (struct charField));
-    for (i=0; i < b->slen; i++) {
-        setInCharField (cf, b->data[i]);
-    }
-    return BSTR_OK;
-}
-
-static void invertCharField (struct charField * cf) {
-int i;
-    for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
-}
-
-/* Inner engine for binchr */
-static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
-int i;
-    for (i=pos; i < len; i++) {
-        unsigned char c = (unsigned char) data[i];
-        if (testInCharField (cf, c)) return i;
-    }
-    return BSTR_ERR;
-}
-
-/*  int binchr (const_bstring b0, int pos, const_bstring b1);
- *
- *  Search for the first position in b0 starting from pos or after, in which 
- *  one of the characters in b1 is found and return it.  If such a position 
- *  does not exist in b0, then BSTR_ERR is returned.
- */
-int binchr (const_bstring b0, int pos, const_bstring b1) {
-struct charField chrs;
-    if (pos < 0 || b0 == NULL || b0->data == NULL ||
-        b0->slen <= pos) return BSTR_ERR;
-    if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
-    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-    return binchrCF (b0->data, b0->slen, pos, &chrs);
-}
-
-/* Inner engine for binchrr */
-static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
-int i;
-    for (i=pos; i >= 0; i--) {
-        unsigned int c = (unsigned int) data[i];
-        if (testInCharField (cf, c)) return i;
-    }
-    return BSTR_ERR;
-}
-
-/*  int binchrr (const_bstring b0, int pos, const_bstring b1);
- *
- *  Search for the last position in b0 no greater than pos, in which one of 
- *  the characters in b1 is found and return it.  If such a position does not 
- *  exist in b0, then BSTR_ERR is returned.
- */
-int binchrr (const_bstring b0, int pos, const_bstring b1) {
-struct charField chrs;
-    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
-        b0->slen < pos) return BSTR_ERR;
-    if (pos == b0->slen) pos--;
-    if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
-    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-    return binchrrCF (b0->data, pos, &chrs);
-}
-
-/*  int bninchr (const_bstring b0, int pos, const_bstring b1);
- *
- *  Search for the first position in b0 starting from pos or after, in which 
- *  none of the characters in b1 is found and return it.  If such a position 
- *  does not exist in b0, then BSTR_ERR is returned.
- */
-int bninchr (const_bstring b0, int pos, const_bstring b1) {
-struct charField chrs;
-    if (pos < 0 || b0 == NULL || b0->data == NULL || 
-        b0->slen <= pos) return BSTR_ERR;
-    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-    invertCharField (&chrs);
-    return binchrCF (b0->data, b0->slen, pos, &chrs);
-}
-
-/*  int bninchrr (const_bstring b0, int pos, const_bstring b1);
- *
- *  Search for the last position in b0 no greater than pos, in which none of 
- *  the characters in b1 is found and return it.  If such a position does not 
- *  exist in b0, then BSTR_ERR is returned.
- */
-int bninchrr (const_bstring b0, int pos, const_bstring b1) {
-struct charField chrs;
-    if (pos < 0 || b0 == NULL || b0->data == NULL || 
-        b0->slen < pos) return BSTR_ERR;
-    if (pos == b0->slen) pos--;
-    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-    invertCharField (&chrs);
-    return binchrrCF (b0->data, pos, &chrs);
-}
-
-/*  int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
- *
- *  Overwrite the string b0 starting at position pos with the string b1. If 
- *  the position pos is past the end of b0, then the character "fill" is 
- *  appended as necessary to make up the gap between the end of b0 and pos.
- *  If b1 is NULL, it behaves as if it were a 0-length string.
- */
-int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill) {
-int d, newlen;
-ptrdiff_t pd;
-bstring aux = (bstring) b1;
-
-    if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
-        b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
-    if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
-
-    d = pos;
-
-    /* Aliasing case */
-    if (NULL != aux) {
-        if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
-            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-        }
-        d += aux->slen;
-    }
-
-    /* Increase memory size if necessary */
-    if (balloc (b0, d + 1) != BSTR_OK) {
-        if (aux != b1) bdestroy (aux);
-        return BSTR_ERR;
-    }
-
-    newlen = b0->slen;
-
-    /* Fill in "fill" character as necessary */
-    if (pos > newlen) {
-        bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
-        newlen = pos;
-    }
-
-    /* Copy b1 to position pos in b0. */
-    if (aux != NULL) {
-        bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
-        if (aux != b1) bdestroy (aux);
-    }
-
-    /* Indicate the potentially increased size of b0 */
-    if (d > newlen) newlen = d;
-
-    b0->slen = newlen;
-    b0->data[newlen] = (unsigned char) '\0';
-
-    return BSTR_OK;
-}
-
-/*  int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
- *
- *  Inserts the string b2 into b1 at position pos.  If the position pos is 
- *  past the end of b1, then the character "fill" is appended as necessary to 
- *  make up the gap between the end of b1 and pos.  Unlike bsetstr, binsert
- *  does not allow b2 to be NULL.
- */
-int binsert (bstring b1, int pos, const_bstring b2, unsigned char fill) {
-int d, l;
-ptrdiff_t pd;
-bstring aux = (bstring) b2;
-
-    if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
-        b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
-
-    /* Aliasing case */
-    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
-        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-    }
-
-    /* Compute the two possible end pointers */
-    d = b1->slen + aux->slen;
-    l = pos + aux->slen;
-    if ((d|l) < 0) return BSTR_ERR;
-
-    if (l > d) {
-        /* Inserting past the end of the string */
-        if (balloc (b1, l + 1) != BSTR_OK) {
-            if (aux != b2) bdestroy (aux);
-            return BSTR_ERR;
-        }
-        bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
-        b1->slen = l;
-    } else {
-        /* Inserting in the middle of the string */
-        if (balloc (b1, d + 1) != BSTR_OK) {
-            if (aux != b2) bdestroy (aux);
-            return BSTR_ERR;
-        }
-        bBlockCopy (b1->data + l, b1->data + pos, d - l);
-        b1->slen = d;
-    }
-    bBlockCopy (b1->data + pos, aux->data, aux->slen);
-    b1->data[b1->slen] = (unsigned char) '\0';
-    if (aux != b2) bdestroy (aux);
-    return BSTR_OK;
-}
-
-/*  int breplace (bstring b1, int pos, int len, bstring b2, 
- *                unsigned char fill)
- *
- *  Replace a section of a string from pos for a length len with the string b2.
- *  fill is used is pos > b1->slen.
- */
-int breplace (bstring b1, int pos, int len, const_bstring b2, 
-              unsigned char fill) {
-int pl, ret;
-ptrdiff_t pd;
-bstring aux = (bstring) b2;
-
-    if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
-        b2 == NULL || b1->data == NULL || b2->data == NULL || 
-        b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
-        b1->mlen <= 0) return BSTR_ERR;
-
-    /* Straddles the end? */
-    if (pl >= b1->slen) {
-        if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
-        if (pos + b2->slen < b1->slen) {
-            b1->slen = pos + b2->slen;
-            b1->data[b1->slen] = (unsigned char) '\0';
-        }
-        return ret;
-    }
-
-    /* Aliasing case */
-    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
-        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-    }
-
-    if (aux->slen > len) {
-        if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
-            if (aux != b2) bdestroy (aux);
-            return BSTR_ERR;
-        }
-    }
-
-    if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
-    bstr__memcpy (b1->data + pos, aux->data, aux->slen);
-    b1->slen += aux->slen - len;
-    b1->data[b1->slen] = (unsigned char) '\0';
-    if (aux != b2) bdestroy (aux);
-    return BSTR_OK;
-}
-
-/*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
- *                    int pos)
- *
- *  Replace all occurrences of a find string with a replace string after a
- *  given point in a bstring.
- */
-
-typedef int (*instr_fnptr) (const_bstring s1, int pos, const_bstring s2);
-
-static int findreplaceengine (bstring b, const_bstring find, const_bstring repl, int pos, instr_fnptr instr) {
-int i, ret, slen, mlen, delta, acc;
-int * d;
-int static_d[32];
-ptrdiff_t pd;
-bstring auxf = (bstring) find;
-bstring auxr = (bstring) repl;
-
-    if (b == NULL || b->data == NULL || find == NULL ||
-        find->data == NULL || repl == NULL || repl->data == NULL || 
-        pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
-        b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
-    if (pos > b->slen - find->slen) return BSTR_OK;
-
-    /* Alias with find string */
-    pd = (ptrdiff_t) (find->data - b->data);
-    if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
-        if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
-    }
-
-    /* Alias with repl string */
-    pd = (ptrdiff_t) (repl->data - b->data);
-    if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
-        if (NULL == (auxr = bstrcpy (repl))) {
-            if (auxf != find) bdestroy (auxf);
-            return BSTR_ERR;
-        }
-    }
-
-    delta = auxf->slen - auxr->slen;
-
-    /* in-place replacement since find and replace strings are of equal 
-       length */
-    if (delta == 0) {
-        while ((pos = instr (b, pos, auxf)) >= 0) {
-            bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
-            pos += auxf->slen;
-        }
-        if (auxf != find) bdestroy (auxf);
-        if (auxr != repl) bdestroy (auxr);
-        return BSTR_OK;
-    }
-
-    /* shrinking replacement since auxf->slen > auxr->slen */
-    if (delta > 0) {
-        acc = 0;
-
-        while ((i = instr (b, pos, auxf)) >= 0) {
-            if (acc && i > pos)
-                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-            if (auxr->slen)
-                bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
-            acc += delta;
-            pos = i + auxf->slen;
-        }
-
-        if (acc) {
-            i = b->slen;
-            if (i > pos)
-                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-            b->slen -= acc;
-            b->data[b->slen] = (unsigned char) '\0';
-        }
-
-        if (auxf != find) bdestroy (auxf);
-        if (auxr != repl) bdestroy (auxr);
-        return BSTR_OK;
-    }
-
-    /* expanding replacement since find->slen < repl->slen.  Its a lot 
-       more complicated. */
-
-    mlen = 32;
-    d = (int *) static_d; /* Avoid malloc for trivial cases */
-    acc = slen = 0;
-
-    while ((pos = instr (b, pos, auxf)) >= 0) {
-        if (slen + 1 >= mlen) {
-            int sl;
-            int * t;
-            mlen += mlen;
-            sl = sizeof (int *) * mlen;
-            if (static_d == d) d = NULL;
-            if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
-                ret = BSTR_ERR;
-                goto done;
-            }
-            if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
-            d = t;
-        }
-        d[slen] = pos;
-        slen++;
-        acc -= delta;
-        pos += auxf->slen;
-        if (pos < 0 || acc < 0) {
-            ret = BSTR_ERR;
-            goto done;
-        }
-    }
-    d[slen] = b->slen;
-
-    if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
-        b->slen += acc;
-        for (i = slen-1; i >= 0; i--) {
-            int s, l;
-            s = d[i] + auxf->slen;
-            l = d[i+1] - s;
-            if (l) {
-                bstr__memmove (b->data + s + acc, b->data + s, l);
-            }
-            if (auxr->slen) {
-                bstr__memmove (b->data + s + acc - auxr->slen, 
-                         auxr->data, auxr->slen);
-            }
-            acc += delta;        
-        }
-        b->data[b->slen] = (unsigned char) '\0';
-    }
-
-    done:;
-    if (static_d == d) d = NULL;
-    bstr__free (d);
-    if (auxf != find) bdestroy (auxf);
-    if (auxr != repl) bdestroy (auxr);
-    return ret;
-}
-
-/*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
- *                    int pos)
- *
- *  Replace all occurrences of a find string with a replace string after a
- *  given point in a bstring.
- */
-int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
-    return findreplaceengine (b, find, repl, pos, binstr);
-}
-
-/*  int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, 
- *                    int pos)
- *
- *  Replace all occurrences of a find string, ignoring case, with a replace 
- *  string after a given point in a bstring.
- */
-int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
-    return findreplaceengine (b, find, repl, pos, binstrcaseless);
-}
-
-/*  int binsertch (bstring b, int pos, int len, unsigned char fill)
- *
- *  Inserts the character fill repeatedly into b at position pos for a 
- *  length len.  If the position pos is past the end of b, then the 
- *  character "fill" is appended as necessary to make up the gap between the 
- *  end of b and the position pos + len.
- */
-int binsertch (bstring b, int pos, int len, unsigned char fill) {
-int d, l, i;
-
-    if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
-        b->mlen <= 0 || len < 0) return BSTR_ERR;
-
-    /* Compute the two possible end pointers */
-    d = b->slen + len;
-    l = pos + len;
-    if ((d|l) < 0) return BSTR_ERR;
-
-    if (l > d) {
-        /* Inserting past the end of the string */
-        if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
-        pos = b->slen;
-        b->slen = l;
-    } else {
-        /* Inserting in the middle of the string */
-        if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
-        for (i = d - 1; i >= l; i--) {
-            b->data[i] = b->data[i - len];
-        }
-        b->slen = d;
-    }
-
-    for (i=pos; i < l; i++) b->data[i] = fill;
-    b->data[b->slen] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  int bpattern (bstring b, int len)
- *
- *  Replicate the bstring, b in place, end to end repeatedly until it 
- *  surpasses len characters, then chop the result to exactly len characters. 
- *  This function operates in-place.  The function will return with BSTR_ERR 
- *  if b is NULL or of length 0, otherwise BSTR_OK is returned.
- */
-int bpattern (bstring b, int len) {
-int i, d;
-
-    d = blength (b);
-    if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
-    if (len > 0) {
-        if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
-        for (i = d; i < len; i++) b->data[i] = b->data[i - d];
-    }
-    b->data[len] = (unsigned char) '\0';
-    b->slen = len;
-    return BSTR_OK;
-}
-
-#define BS_BUFF_SZ (1024)
-
-/*  int breada (bstring b, bNread readPtr, void * parm)
- *
- *  Use a finite buffer fread-like function readPtr to concatenate to the 
- *  bstring b the entire contents of file-like source data in a roughly 
- *  efficient way.
- */
-int breada (bstring b, bNread readPtr, void * parm) {
-int i, l, n;
-
-    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-        b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
-
-    i = b->slen;
-    for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
-        if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
-        l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
-        i += l;
-        b->slen = i;
-        if (i < n) break;
-    }
-
-    b->data[i] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  bstring bread (bNread readPtr, void * parm)
- *
- *  Use a finite buffer fread-like function readPtr to create a bstring 
- *  filled with the entire contents of file-like source data in a roughly 
- *  efficient way.
- */
-bstring bread (bNread readPtr, void * parm) {
-bstring buff;
-
-    if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
-        bdestroy (buff);
-        return NULL;
-    }
-    return buff;
-}
-
-/*  int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
- *
- *  Use an fgetc-like single character stream reading function (getcPtr) to 
- *  obtain a sequence of characters which are concatenated to the end of the
- *  bstring b.  The stream read is terminated by the passed in terminator 
- *  parameter.
- *
- *  If getcPtr returns with a negative number, or the terminator character 
- *  (which is appended) is read, then the stream reading is halted and the 
- *  function returns with a partial result in b.  If there is an empty partial
- *  result, 1 is returned.  If no characters are read, or there is some other 
- *  detectable error, BSTR_ERR is returned.
- */
-int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
-int c, d, e;
-
-    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-    d = 0;
-    e = b->mlen - 2;
-
-    while ((c = getcPtr (parm)) >= 0) {
-        if (d > e) {
-            b->slen = d;
-            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-            e = b->mlen - 2;
-        }
-        b->data[d] = (unsigned char) c;
-        d++;
-        if (c == terminator) break;
-    }
-
-    b->data[d] = (unsigned char) '\0';
-    b->slen = d;
-
-    return d == 0 && c < 0;
-}
-
-/*  int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
- *
- *  Use an fgetc-like single character stream reading function (getcPtr) to 
- *  obtain a sequence of characters which are concatenated to the end of the
- *  bstring b.  The stream read is terminated by the passed in terminator 
- *  parameter.
- *
- *  If getcPtr returns with a negative number, or the terminator character 
- *  (which is appended) is read, then the stream reading is halted and the 
- *  function returns with a partial result concatentated to b.  If there is 
- *  an empty partial result, 1 is returned.  If no characters are read, or 
- *  there is some other detectable error, BSTR_ERR is returned.
- */
-int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
-int c, d, e;
-
-    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-    d = b->slen;
-    e = b->mlen - 2;
-
-    while ((c = getcPtr (parm)) >= 0) {
-        if (d > e) {
-            b->slen = d;
-            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-            e = b->mlen - 2;
-        }
-        b->data[d] = (unsigned char) c;
-        d++;
-        if (c == terminator) break;
-    }
-
-    b->data[d] = (unsigned char) '\0';
-    b->slen = d;
-
-    return d == 0 && c < 0;
-}
-
-/*  bstring bgets (bNgetc getcPtr, void * parm, char terminator)
- *
- *  Use an fgetc-like single character stream reading function (getcPtr) to 
- *  obtain a sequence of characters which are concatenated into a bstring.  
- *  The stream read is terminated by the passed in terminator function.
- *
- *  If getcPtr returns with a negative number, or the terminator character 
- *  (which is appended) is read, then the stream reading is halted and the 
- *  result obtained thus far is returned.  If no characters are read, or 
- *  there is some other detectable error, NULL is returned.
- */
-bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
-bstring buff;
-
-    if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
-        bdestroy (buff);
-        buff = NULL;
-    }
-    return buff;
-}
-
-struct bStream {
-    bstring buff;        /* Buffer for over-reads */
-    void * parm;        /* The stream handle for core stream */
-    bNread readFnPtr;    /* fread compatible fnptr for core stream */
-    int isEOF;        /* track file's EOF state */
-    int maxBuffSz;
-};
-
-/*  struct bStream * bsopen (bNread readPtr, void * parm)
- *
- *  Wrap a given open stream (described by a fread compatible function 
- *  pointer and stream handle) into an open bStream suitable for the bstring 
- *  library streaming functions.
- */
-struct bStream * bsopen (bNread readPtr, void * parm) {
-struct bStream * s;
-
-    if (readPtr == NULL) return NULL;
-    s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
-    if (s == NULL) return NULL;
-    s->parm = parm;
-    s->buff = bfromcstr ("");
-    s->readFnPtr = readPtr;
-    s->maxBuffSz = BS_BUFF_SZ;
-    s->isEOF = 0;
-    return s;
-}
-
-/*  int bsbufflength (struct bStream * s, int sz)
- *
- *  Set the length of the buffer used by the bStream.  If sz is zero, the 
- *  length is not set.  This function returns with the previous length.
- */
-int bsbufflength (struct bStream * s, int sz) {
-int oldSz;
-    if (s == NULL || sz < 0) return BSTR_ERR;
-    oldSz = s->maxBuffSz;
-    if (sz > 0) s->maxBuffSz = sz;
-    return oldSz;
-}
-
-int bseof (const struct bStream * s) {
-    if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
-    return s->isEOF && (s->buff->slen == 0);
-}
-
-/*  void * bsclose (struct bStream * s)
- *
- *  Close the bStream, and return the handle to the stream that was originally
- *  used to open the given stream.
- */
-void * bsclose (struct bStream * s) {
-void * parm;
-    if (s == NULL) return NULL;
-    s->readFnPtr = NULL;
-    if (s->buff) bdestroy (s->buff);
-    s->buff = NULL;
-    parm = s->parm;
-    s->parm = NULL;
-    s->isEOF = 1;
-    bstr__free (s);
-    return parm;
-}
-
-/*  int bsreadlna (bstring r, struct bStream * s, char terminator)
- *
- *  Read a bstring terminated by the terminator character or the end of the
- *  stream from the bStream (s) and return it into the parameter r.  This 
- *  function may read additional characters from the core stream that are not 
- *  returned, but will be retained for subsequent read operations.
- */
-int bsreadlna (bstring r, struct bStream * s, char terminator) {
-int i, l, ret, rlo;
-char * b;
-struct tagbstring x;
-
-    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
-        r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
-    l = s->buff->slen;
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    b = (char *) s->buff->data;
-    x.data = (unsigned char *) b;
-
-    /* First check if the current buffer holds the terminator */
-    b[l] = terminator; /* Set sentinel */
-    for (i=0; b[i] != terminator; i++) ;
-    if (i < l) {
-        x.slen = i + 1;
-        ret = bconcat (r, &x);
-        s->buff->slen = l;
-        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-        return BSTR_OK;
-    }
-
-    rlo = r->slen;
-
-    /* If not then just concatenate the entire buffer to the output */
-    x.slen = l;
-    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-    /* Perform direct in-place reads into the destination to allow for
-       the minimum of data-copies */
-    for (;;) {
-        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-        b = (char *) (r->data + r->slen);
-        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-        if (l <= 0) {
-            r->data[r->slen] = (unsigned char) '\0';
-            s->buff->slen = 0;
-            s->isEOF = 1;
-            /* If nothing was read return with an error message */
-            return BSTR_ERR & -(r->slen == rlo);
-        }
-        b[l] = terminator; /* Set sentinel */
-        for (i=0; b[i] != terminator; i++) ;
-        if (i < l) break;
-        r->slen += l;
-    }
-
-    /* Terminator found, push over-read back to buffer */
-    i++;
-    r->slen += i;
-    s->buff->slen = l - i;
-    bstr__memcpy (s->buff->data, b + i, l - i);
-    r->data[r->slen] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  int bsreadlnsa (bstring r, struct bStream * s, bstring term)
- *
- *  Read a bstring terminated by any character in the term string or the end 
- *  of the stream from the bStream (s) and return it into the parameter r.  
- *  This function may read additional characters from the core stream that 
- *  are not returned, but will be retained for subsequent read operations.
- */
-int bsreadlnsa (bstring r, struct bStream * s, const_bstring term) {
-int i, l, ret, rlo;
-unsigned char * b;
-struct tagbstring x;
-struct charField cf;
-
-    if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
-        term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
-        r->mlen < r->slen) return BSTR_ERR;
-    if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
-    if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
-
-    l = s->buff->slen;
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    b = (unsigned char *) s->buff->data;
-    x.data = b;
-
-    /* First check if the current buffer holds the terminator */
-    b[l] = term->data[0]; /* Set sentinel */
-    for (i=0; !testInCharField (&cf, b[i]); i++) ;
-    if (i < l) {
-        x.slen = i + 1;
-        ret = bconcat (r, &x);
-        s->buff->slen = l;
-        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-        return BSTR_OK;
-    }
-
-    rlo = r->slen;
-
-    /* If not then just concatenate the entire buffer to the output */
-    x.slen = l;
-    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-    /* Perform direct in-place reads into the destination to allow for
-       the minimum of data-copies */
-    for (;;) {
-        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-        b = (unsigned char *) (r->data + r->slen);
-        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-        if (l <= 0) {
-            r->data[r->slen] = (unsigned char) '\0';
-            s->buff->slen = 0;
-            s->isEOF = 1;
-            /* If nothing was read return with an error message */
-            return BSTR_ERR & -(r->slen == rlo);
-        }
-
-        b[l] = term->data[0]; /* Set sentinel */
-        for (i=0; !testInCharField (&cf, b[i]); i++) ;
-        if (i < l) break;
-        r->slen += l;
-    }
-
-    /* Terminator found, push over-read back to buffer */
-    i++;
-    r->slen += i;
-    s->buff->slen = l - i;
-    bstr__memcpy (s->buff->data, b + i, l - i);
-    r->data[r->slen] = (unsigned char) '\0';
-    return BSTR_OK;
-}
-
-/*  int bsreada (bstring r, struct bStream * s, int n)
- *
- *  Read a bstring of length n (or, if it is fewer, as many bytes as is 
- *  remaining) from the bStream.  This function may read additional 
- *  characters from the core stream that are not returned, but will be 
- *  retained for subsequent read operations.  This function will not read
- *  additional characters from the core stream beyond virtual stream pointer.
- */
-int bsreada (bstring r, struct bStream * s, int n) {
-int l, ret, orslen;
-char * b;
-struct tagbstring x;
-
-    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-     || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
-
-    n += r->slen;
-    if (n <= 0) return BSTR_ERR;
-
-    l = s->buff->slen;
-
-    orslen = r->slen;
-
-    if (0 == l) {
-        if (s->isEOF) return BSTR_ERR;
-        if (r->mlen > n) {
-            l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
-            if (0 >= l || l > n - r->slen) {
-                s->isEOF = 1;
-                return BSTR_ERR;
-            }
-            r->slen += l;
-            r->data[r->slen] = (unsigned char) '\0';
-            return 0;
-        }
-    }
-
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    b = (char *) s->buff->data;
-    x.data = (unsigned char *) b;
-
-    do {
-        if (l + r->slen >= n) {
-            x.slen = n - r->slen;
-            ret = bconcat (r, &x);
-            s->buff->slen = l;
-            if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
-            return BSTR_ERR & -(r->slen == orslen);
-        }
-
-        x.slen = l;
-        if (BSTR_OK != bconcat (r, &x)) break;
-
-        l = n - r->slen;
-        if (l > s->maxBuffSz) l = s->maxBuffSz;
-
-        l = (int) s->readFnPtr (b, 1, l, s->parm);
-
-    } while (l > 0);
-    if (l < 0) l = 0;
-    if (l == 0) s->isEOF = 1;
-    s->buff->slen = l;
-    return BSTR_ERR & -(r->slen == orslen);
-}
-
-/*  int bsreadln (bstring r, struct bStream * s, char terminator)
- *
- *  Read a bstring terminated by the terminator character or the end of the
- *  stream from the bStream (s) and return it into the parameter r.  This 
- *  function may read additional characters from the core stream that are not 
- *  returned, but will be retained for subsequent read operations.
- */
-int bsreadln (bstring r, struct bStream * s, char terminator) {
-    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
-        return BSTR_ERR;
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    r->slen = 0;
-    return bsreadlna (r, s, terminator);
-}
-
-/*  int bsreadlns (bstring r, struct bStream * s, bstring term)
- *
- *  Read a bstring terminated by any character in the term string or the end 
- *  of the stream from the bStream (s) and return it into the parameter r.  
- *  This function may read additional characters from the core stream that 
- *  are not returned, but will be retained for subsequent read operations.
- */
-int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
-    if (s == NULL || s->buff == NULL || r == NULL || term == NULL 
-     || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
-    if (term->slen == 1) return bsreadln (r, s, term->data[0]);
-    if (term->slen < 1) return BSTR_ERR;
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    r->slen = 0;
-    return bsreadlnsa (r, s, term);
-}
-
-/*  int bsread (bstring r, struct bStream * s, int n)
- *
- *  Read a bstring of length n (or, if it is fewer, as many bytes as is 
- *  remaining) from the bStream.  This function may read additional 
- *  characters from the core stream that are not returned, but will be 
- *  retained for subsequent read operations.  This function will not read
- *  additional characters from the core stream beyond virtual stream pointer.
- */
-int bsread (bstring r, struct bStream * s, int n) {
-    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-     || n <= 0) return BSTR_ERR;
-    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-    r->slen = 0;
-    return bsreada (r, s, n);
-}
-
-/*  int bsunread (struct bStream * s, const_bstring b)
- *
- *  Insert a bstring into the bStream at the current position.  These 
- *  characters will be read prior to those that actually come from the core 
- *  stream.
- */
-int bsunread (struct bStream * s, const_bstring b) {
-    if (s == NULL || s->buff == NULL) return BSTR_ERR;
-    return binsert (s->buff, 0, b, (unsigned char) '?');
-}
-
-/*  int bspeek (bstring r, const struct bStream * s)
- *
- *  Return the currently buffered characters from the bStream that will be 
- *  read prior to reads from the core stream.
- */
-int bspeek (bstring r, const struct bStream * s) {
-    if (s == NULL || s->buff == NULL) return BSTR_ERR;
-    return bassign (r, s->buff);
-}
-
-/*  bstring bjoin (const struct bstrList * bl, const_bstring sep);
- *
- *  Join the entries of a bstrList into one bstring by sequentially 
- *  concatenating them with the sep string in between.  If there is an error 
- *  NULL is returned, otherwise a bstring with the correct result is returned.
- */
-bstring bjoin (const struct bstrList * bl, const_bstring sep) {
-bstring b;
-int i, c, v;
-
-    if (bl == NULL || bl->qty < 0) return NULL;
-    if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
-
-    for (i = 0, c = 1; i < bl->qty; i++) {
-        v = bl->entry[i]->slen;
-        if (v < 0) return NULL;    /* Invalid input */
-        c += v;
-        if (c < 0) return NULL;    /* Wrap around ?? */
-    }
-
-    if (sep != NULL) c += (bl->qty - 1) * sep->slen;
-
-    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-    if (NULL == b) return NULL; /* Out of memory */
-    b->data = (unsigned char *) bstr__alloc (c);
-    if (b->data == NULL) {
-        bstr__free (b);
-        return NULL;
-    }
-
-    b->mlen = c;
-    b->slen = c-1;
-
-    for (i = 0, c = 0; i < bl->qty; i++) {
-        if (i > 0 && sep != NULL) {
-            bstr__memcpy (b->data + c, sep->data, sep->slen);
-            c += sep->slen;
-        }
-        v = bl->entry[i]->slen;
-        bstr__memcpy (b->data + c, bl->entry[i]->data, v);
-        c += v;
-    }
-    b->data[c] = (unsigned char) '\0';
-    return b;
-}
-
-#define BSSSC_BUFF_LEN (256)
-
-/*  int bssplitscb (struct bStream * s, const_bstring splitStr, 
- *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
- *
- *  Iterate the set of disjoint sequential substrings read from a stream 
- *  divided by any of the characters in splitStr.  An empty splitStr causes 
- *  the whole stream to be iterated once.
- *
- *  Note: At the point of calling the cb function, the bStream pointer is 
- *  pointed exactly at the position right after having read the split 
- *  character.  The cb function can act on the stream by causing the bStream
- *  pointer to move, and bssplitscb will continue by starting the next split
- *  at the position of the pointer after the return from cb.
- *
- *  However, if the cb causes the bStream s to be destroyed then the cb must
- *  return with a negative value, otherwise bssplitscb will continue in an 
- *  undefined manner.
- */
-int bssplitscb (struct bStream * s, const_bstring splitStr, 
-    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
-struct charField chrs;
-bstring buff;
-int i, p, ret;
-
-    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-    if (splitStr->slen == 0) {
-        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
-        if ((ret = cb (parm, 0, buff)) > 0) 
-            ret = 0;
-    } else {
-        buildCharField (&chrs, splitStr);
-        ret = p = i = 0;
-        for (;;) {
-            if (i >= buff->slen) {
-                bsreada (buff, s, BSSSC_BUFF_LEN);
-                if (i >= buff->slen) {
-                    if (0 < (ret = cb (parm, p, buff))) ret = 0;
-                    break;
-                }
-            }
-            if (testInCharField (&chrs, buff->data[i])) {
-                struct tagbstring t;
-                unsigned char c;
-
-                blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
-                if ((ret = bsunread (s, &t)) < 0) break;
-                buff->slen = i;
-                c = buff->data[i];
-                buff->data[i] = (unsigned char) '\0';
-                if ((ret = cb (parm, p, buff)) < 0) break;
-                buff->data[i] = c;
-                buff->slen = 0;
-                p += i + 1;
-                i = -1;
-            }
-            i++;
-        }
-    }
-
-    bdestroy (buff);
-    return ret;
-}
-
-/*  int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
- *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
- *
- *  Iterate the set of disjoint sequential substrings read from a stream 
- *  divided by the entire substring splitStr.  An empty splitStr causes 
- *  each character of the stream to be iterated.
- *
- *  Note: At the point of calling the cb function, the bStream pointer is 
- *  pointed exactly at the position right after having read the split 
- *  character.  The cb function can act on the stream by causing the bStream
- *  pointer to move, and bssplitscb will continue by starting the next split
- *  at the position of the pointer after the return from cb.
- *
- *  However, if the cb causes the bStream s to be destroyed then the cb must
- *  return with a negative value, otherwise bssplitscb will continue in an 
- *  undefined manner.
- */
-int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
-    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
-bstring buff;
-int i, p, ret;
-
-    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-    if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
-
-    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-    if (splitStr->slen == 0) {
-        for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
-            if ((ret = cb (parm, 0, buff)) < 0) {
-                bdestroy (buff);
-                return ret;
-            }
-            buff->slen = 0;
-        }
-        return BSTR_OK;
-    } else {
-        ret = p = i = 0;
-        for (i=p=0;;) {
-            if ((ret = binstr (buff, 0, splitStr)) >= 0) {
-                struct tagbstring t;
-                blk2tbstr (t, buff->data, ret);
-                i = ret + splitStr->slen;
-                if ((ret = cb (parm, p, &t)) < 0) break;
-                p += i;
-                bdelete (buff, 0, i);
-            } else {
-                bsreada (buff, s, BSSSC_BUFF_LEN);
-                if (bseof (s)) {
-                    if ((ret = cb (parm, p, buff)) > 0) ret = 0;
-                    break;
-                }
-            }
-        }
-    }
-
-    bdestroy (buff);
-    return ret;
-}
-
-/*  int bstrListCreate (void)
- *
- *  Create a bstrList.
- */
-struct bstrList * bstrListCreate (void) {
-struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-    if (sl) {
-        sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
-        if (!sl->entry) {
-            bstr__free (sl);
-            sl = NULL;
-        } else {
-            sl->qty = 0;
-            sl->mlen = 1;
-        }
-    }
-    return sl;
-}
-
-/*  int bstrListDestroy (struct bstrList * sl)
- *
- *  Destroy a bstrList that has been created by bsplit, bsplits or bstrListCreate.
- */
-int bstrListDestroy (struct bstrList * sl) {
-int i;
-    if (sl == NULL || sl->qty < 0) return BSTR_ERR;
-    for (i=0; i < sl->qty; i++) {
-        if (sl->entry[i]) {
-            bdestroy (sl->entry[i]);
-            sl->entry[i] = NULL;
-        }
-    }
-    sl->qty  = -1;
-    sl->mlen = -1;
-    bstr__free (sl->entry);
-    sl->entry = NULL;
-    bstr__free (sl);
-    return BSTR_OK;
-}
-
-/*  int bstrListAlloc (struct bstrList * sl, int msz)
- *
- *  Ensure that there is memory for at least msz number of entries for the
- *  list.
- */
-int bstrListAlloc (struct bstrList * sl, int msz) {
-bstring * l;
-int smsz;
-size_t nsz;
-    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-    if (sl->mlen >= msz) return BSTR_OK;
-    smsz = snapUpSize (msz);
-    nsz = ((size_t) smsz) * sizeof (bstring);
-    if (nsz < (size_t) smsz) return BSTR_ERR;
-    l = (bstring *) bstr__realloc (sl->entry, nsz);
-    if (!l) {
-        smsz = msz;
-        nsz = ((size_t) smsz) * sizeof (bstring);
-        l = (bstring *) bstr__realloc (sl->entry, nsz);
-        if (!l) return BSTR_ERR;
-    }
-    sl->mlen = smsz;
-    sl->entry = l;
-    return BSTR_OK;
-}
-
-/*  int bstrListAllocMin (struct bstrList * sl, int msz)
- *
- *  Try to allocate the minimum amount of memory for the list to include at
- *  least msz entries or sl->qty whichever is greater.
- */
-int bstrListAllocMin (struct bstrList * sl, int msz) {
-bstring * l;
-size_t nsz;
-    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-    if (msz < sl->qty) msz = sl->qty;
-    if (sl->mlen == msz) return BSTR_OK;
-    nsz = ((size_t) msz) * sizeof (bstring);
-    if (nsz < (size_t) msz) return BSTR_ERR;
-    l = (bstring *) bstr__realloc (sl->entry, nsz);
-    if (!l) return BSTR_ERR;
-    sl->mlen = msz;
-    sl->entry = l;
-    return BSTR_OK;
-}
-
-/*  int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- *    int (* cb) (void * parm, int ofs, int len), void * parm)
- *
- *  Iterate the set of disjoint sequential substrings over str divided by the
- *  character in splitChar.
- *
- *  Note: Non-destructive modification of str from within the cb function 
- *  while performing this split is not undefined.  bsplitcb behaves in 
- *  sequential lock step with calls to cb.  I.e., after returning from a cb 
- *  that return a non-negative integer, bsplitcb continues from the position 
- *  1 character after the last detected split character and it will halt 
- *  immediately if the length of str falls below this point.  However, if the 
- *  cb function destroys str, then it *must* return with a negative value, 
- *  otherwise bsplitcb will continue in an undefined manner.
- */
-int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
-    int (* cb) (void * parm, int ofs, int len), void * parm) {
-int i, p, ret;
-
-    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) 
-        return BSTR_ERR;
-
-    p = pos;
-    do {
-        for (i=p; i < str->slen; i++) {
-            if (str->data[i] == splitChar) break;
-        }
-        if ((ret = cb (parm, p, i - p)) < 0) return ret;
-        p = i + 1;
-    } while (p <= str->slen);
-    return BSTR_OK;
-}
-
-/*  int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- *    int (* cb) (void * parm, int ofs, int len), void * parm)
- *
- *  Iterate the set of disjoint sequential substrings over str divided by any 
- *  of the characters in splitStr.  An empty splitStr causes the whole str to
- *  be iterated once.
- *
- *  Note: Non-destructive modification of str from within the cb function 
- *  while performing this split is not undefined.  bsplitscb behaves in 
- *  sequential lock step with calls to cb.  I.e., after returning from a cb 
- *  that return a non-negative integer, bsplitscb continues from the position 
- *  1 character after the last detected split character and it will halt 
- *  immediately if the length of str falls below this point.  However, if the 
- *  cb function destroys str, then it *must* return with a negative value, 
- *  otherwise bsplitscb will continue in an undefined manner.
- */
-int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
-    int (* cb) (void * parm, int ofs, int len), void * parm) {
-struct charField chrs;
-int i, p, ret;
-
-    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-    if (splitStr->slen == 0) {
-        if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
-        return ret;
-    }
-
-    if (splitStr->slen == 1) 
-        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
-
-    buildCharField (&chrs, splitStr);
-
-    p = pos;
-    do {
-        for (i=p; i < str->slen; i++) {
-            if (testInCharField (&chrs, str->data[i])) break;
-        }
-        if ((ret = cb (parm, p, i - p)) < 0) return ret;
-        p = i + 1;
-    } while (p <= str->slen);
-    return BSTR_OK;
-}
-
-/*  int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- *    int (* cb) (void * parm, int ofs, int len), void * parm)
- *
- *  Iterate the set of disjoint sequential substrings over str divided by the 
- *  substring splitStr.  An empty splitStr causes the whole str to be 
- *  iterated once.
- *
- *  Note: Non-destructive modification of str from within the cb function 
- *  while performing this split is not undefined.  bsplitstrcb behaves in 
- *  sequential lock step with calls to cb.  I.e., after returning from a cb 
- *  that return a non-negative integer, bsplitscb continues from the position 
- *  1 character after the last detected split character and it will halt 
- *  immediately if the length of str falls below this point.  However, if the 
- *  cb function destroys str, then it *must* return with a negative value, 
- *  otherwise bsplitscb will continue in an undefined manner.
- */
-int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
-    int (* cb) (void * parm, int ofs, int len), void * parm) {
-int i, p, ret;
-
-    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-    if (0 == splitStr->slen) {
-        for (i=pos; i < str->slen; i++) {
-            if ((ret = cb (parm, i, 1)) < 0) return ret;
-        }
-        return BSTR_OK;
-    }
-
-    if (splitStr->slen == 1) 
-        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
-
-    for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
-        if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
-            if ((ret = cb (parm, p, i - p)) < 0) return ret;
-            i += splitStr->slen;
-            p = i;
-        }
-    }
-    if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
-    return BSTR_OK;
-}
-
-struct genBstrList {
-    bstring b;
-    struct bstrList * bl;
-};
-
-static int bscb (void * parm, int ofs, int len) {
-struct genBstrList * g = (struct genBstrList *) parm;
-    if (g->bl->qty >= g->bl->mlen) {
-        int mlen = g->bl->mlen * 2;
-        bstring * tbl;
-
-        while (g->bl->qty >= mlen) {
-            if (mlen < g->bl->mlen) return BSTR_ERR;
-            mlen += mlen;
-        }
-
-        tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
-        if (tbl == NULL) return BSTR_ERR;
-
-        g->bl->entry = tbl;
-        g->bl->mlen = mlen;
-    }
-
-    g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
-    g->bl->qty++;
-    return BSTR_OK;
-}
-
-/*  struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
- *
- *  Create an array of sequential substrings from str divided by the character
- *  splitChar.  
- */
-struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
-struct genBstrList g;
-
-    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
-
-    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-    if (g.bl == NULL) return NULL;
-    g.bl->mlen = 4;
-    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-    if (NULL == g.bl->entry) {
-        bstr__free (g.bl);
-        return NULL;
-    }
-
-    g.b = (bstring) str;
-    g.bl->qty = 0;
-    if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
-        bstrListDestroy (g.bl);
-        return NULL;
-    }
-    return g.bl;
-}
-
-/*  struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
- *
- *  Create an array of sequential substrings from str divided by the entire
- *  substring splitStr.
- */
-struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
-struct genBstrList g;
-
-    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
-
-    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-    if (g.bl == NULL) return NULL;
-    g.bl->mlen = 4;
-    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-    if (NULL == g.bl->entry) {
-        bstr__free (g.bl);
-        return NULL;
-    }
-
-    g.b = (bstring) str;
-    g.bl->qty = 0;
-    if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
-        bstrListDestroy (g.bl);
-        return NULL;
-    }
-    return g.bl;
-}
-
-/*  struct bstrList * bsplits (const_bstring str, bstring splitStr)
- *
- *  Create an array of sequential substrings from str divided by any of the 
- *  characters in splitStr.  An empty splitStr causes a single entry bstrList
- *  containing a copy of str to be returned.
- */
-struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
-struct genBstrList g;
-
-    if (     str == NULL ||      str->slen < 0 ||      str->data == NULL ||
-        splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
-        return NULL;
-
-    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-    if (g.bl == NULL) return NULL;
-    g.bl->mlen = 4;
-    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-    if (NULL == g.bl->entry) {
-        bstr__free (g.bl);
-        return NULL;
-    }
-    g.b = (bstring) str;
-    g.bl->qty = 0;
-
-    if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
-        bstrListDestroy (g.bl);
-        return NULL;
-    }
-    return g.bl;
-}
-
-#if defined (__TURBOC__) && !defined (__BORLANDC__)
-# ifndef BSTRLIB_NOVSNP
-#  define BSTRLIB_NOVSNP
-# endif
-#endif
-
-/* Give WATCOM C/C++, MSVC some latitude for their non-support of vsnprintf */
-#if defined(__WATCOMC__) || defined(_MSC_VER)
-#define exvsnprintf(r,b,n,f,a) {r = _vsnprintf (b,n,f,a);}
-#else
-#ifdef BSTRLIB_NOVSNP
-/* This is just a hack.  If you are using a system without a vsnprintf, it is 
-   not recommended that bformat be used at all. */
-#define exvsnprintf(r,b,n,f,a) {vsprintf (b,f,a); r = -1;}
-#define START_VSNBUFF (256)
-#else
-
-#ifdef __GNUC__
-/* Something is making gcc complain about this prototype not being here, so 
-   I've just gone ahead and put it in. */
-extern int vsnprintf (char *buf, size_t count, const char *format, va_list arg);
-#endif
-
-#define exvsnprintf(r,b,n,f,a) {r = vsnprintf (b,n,f,a);}
-#endif
-#endif
-
-#if !defined (BSTRLIB_NOVSNP)
-
-#ifndef START_VSNBUFF
-#define START_VSNBUFF (16)
-#endif
-
-/* On IRIX vsnprintf returns n-1 when the operation would overflow the target 
-   buffer, WATCOM and MSVC both return -1, while C99 requires that the 
-   returned value be exactly what the length would be if the buffer would be
-   large enough.  This leads to the idea that if the return value is larger 
-   than n, then changing n to the return value will reduce the number of
-   iterations required. */
-
-/*  int bformata (bstring b, const char * fmt, ...)
- *
- *  After the first parameter, it takes the same parameters as printf (), but 
- *  rather than outputting results to stdio, it appends the results to 
- *  a bstring which contains what would have been output. Note that if there 
- *  is an early generation of a '\0' character, the bstring will be truncated 
- *  to this end point.
- */
-int bformata (bstring b, const char * fmt, ...) {
-va_list arglist;
-bstring buff;
-int n, r;
-
-    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
-
-    /* Since the length is not determinable beforehand, a search is
-       performed using the truncating "vsnprintf" call (to avoid buffer
-       overflows) on increasing potential sizes for the output result. */
-
-    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-        n = 1;
-        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-    }
-
-    for (;;) {
-        va_start (arglist, fmt);
-        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-        va_end (arglist);
-
-        buff->data[n] = (unsigned char) '\0';
-        buff->slen = (int) (strlen) ((char *) buff->data);
-
-        if (buff->slen < n) break;
-
-        if (r > n) n = r; else n += n;
-
-        if (BSTR_OK != balloc (buff, n + 2)) {
-            bdestroy (buff);
-            return BSTR_ERR;
-        }
-    }
-
-    r = bconcat (b, buff);
-    bdestroy (buff);
-    return r;
-}
-
-/*  int bassignformat (bstring b, const char * fmt, ...)
- *
- *  After the first parameter, it takes the same parameters as printf (), but 
- *  rather than outputting results to stdio, it outputs the results to 
- *  the bstring parameter b. Note that if there is an early generation of a 
- *  '\0' character, the bstring will be truncated to this end point.
- */
-int bassignformat (bstring b, const char * fmt, ...) {
-va_list arglist;
-bstring buff;
-int n, r;
-
-    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
-
-    /* Since the length is not determinable beforehand, a search is
-       performed using the truncating "vsnprintf" call (to avoid buffer
-       overflows) on increasing potential sizes for the output result. */
-
-    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-        n = 1;
-        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-    }
-
-    for (;;) {
-        va_start (arglist, fmt);
-        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-        va_end (arglist);
-
-        buff->data[n] = (unsigned char) '\0';
-        buff->slen = (int) (strlen) ((char *) buff->data);
-
-        if (buff->slen < n) break;
-
-        if (r > n) n = r; else n += n;
-
-        if (BSTR_OK != balloc (buff, n + 2)) {
-            bdestroy (buff);
-            return BSTR_ERR;
-        }
-    }
-
-    r = bassign (b, buff);
-    bdestroy (buff);
-    return r;
-}
-
-/*  bstring bformat (const char * fmt, ...)
- *
- *  Takes the same parameters as printf (), but rather than outputting results
- *  to stdio, it forms a bstring which contains what would have been output.
- *  Note that if there is an early generation of a '\0' character, the 
- *  bstring will be truncated to this end point.
- */
-bstring bformat (const char * fmt, ...) {
-va_list arglist;
-bstring buff;
-int n, r;
-
-    if (fmt == NULL) return NULL;
-
-    /* Since the length is not determinable beforehand, a search is
-       performed using the truncating "vsnprintf" call (to avoid buffer
-       overflows) on increasing potential sizes for the output result. */
-
-    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-        n = 1;
-        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
-    }
-
-    for (;;) {
-        va_start (arglist, fmt);
-        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-        va_end (arglist);
-
-        buff->data[n] = (unsigned char) '\0';
-        buff->slen = (int) (strlen) ((char *) buff->data);
-
-        if (buff->slen < n) break;
-
-        if (r > n) n = r; else n += n;
-
-        if (BSTR_OK != balloc (buff, n + 2)) {
-            bdestroy (buff);
-            return NULL;
-        }
-    }
-
-    return buff;
-}
-
-/*  int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
- *
- *  The bvcformata function formats data under control of the format control 
- *  string fmt and attempts to append the result to b.  The fmt parameter is 
- *  the same as that of the printf function.  The variable argument list is 
- *  replaced with arglist, which has been initialized by the va_start macro.
- *  The size of the output is upper bounded by count.  If the required output
- *  exceeds count, the string b is not augmented with any contents and a value
- *  below BSTR_ERR is returned.  If a value below -count is returned then it
- *  is recommended that the negative of this value be used as an update to the
- *  count in a subsequent pass.  On other errors, such as running out of 
- *  memory, parameter errors or numeric wrap around BSTR_ERR is returned.  
- *  BSTR_OK is returned when the output is successfully generated and 
- *  appended to b.
- *
- *  Note: There is no sanity checking of arglist, and this function is
- *  destructive of the contents of b from the b->slen point onward.  If there 
- *  is an early generation of a '\0' character, the bstring will be truncated 
- *  to this end point.
- */
-int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
-int n, r, l;
-
-    if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
-     || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
-
-    if (count > (n = b->slen + count) + 2) return BSTR_ERR;
-    if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
-
-    exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
-
-    /* Did the operation complete successfully within bounds? */
-
-    if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
-        b->slen = l;
-        return BSTR_OK;
-    }
-
-    /* Abort, since the buffer was not large enough.  The return value 
-       tries to help set what the retry length should be. */
-
-    b->data[b->slen] = '\0';
-    if (r > count+1) l = r; else {
-        l = count+count;
-        if (count > l) l = INT_MAX;
-    }
-    n = -l;
-    if (n > BSTR_ERR-1) n = BSTR_ERR-1;
-    return n;
-}
-
-#endif
diff --git a/bench/src/bstrlib.c b/bench/src/bstrlib.c
new file mode 120000
index 000000000..315a035e3
--- /dev/null
+++ b/bench/src/bstrlib.c
@@ -0,0 +1 @@
+../../src/bstrlib.c
\ No newline at end of file
diff --git a/bench/src/bstrlib_helper.c b/bench/src/bstrlib_helper.c
new file mode 120000
index 000000000..b50d44f15
--- /dev/null
+++ b/bench/src/bstrlib_helper.c
@@ -0,0 +1 @@
+../../src/bstrlib_helper.c
\ No newline at end of file
diff --git a/bench/src/ptt2asm.c b/bench/src/ptt2asm.c
new file mode 100644
index 000000000..6767e2951
--- /dev/null
+++ b/bench/src/ptt2asm.c
@@ -0,0 +1,767 @@
+/*
+ * =======================================================================================
+ *      Filename:  ptt2asm.c
+ *
+ *      Description:  The interface to dynamically load ptt files
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <libgen.h>
+#include <dirent.h>
+#include <dlfcn.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
+#include <test_types.h>
+
+
+#include <ptt2asm.h>
+
+#ifdef __x86_64
+#include <isa_x86-64.h>
+#endif
+#ifdef __i386__
+#include <isa_x86.h>
+#endif
+#ifdef __ARM_ARCH_7A__
+#include <isa_armv7.h>
+#endif
+#ifdef __ARM_ARCH_8A
+#include <isa_armv8.h>
+#endif
+#ifdef _ARCH_PPC
+#include <isa_ppc64.h>
+#endif
+
+/* Count the entries of map that precede the first entry with an empty pattern. */
+static int registerMapLength(RegisterMap* map)
+{
+    RegisterMap* entry = map;
+
+    while (strlen(entry->pattern) > 0)
+        entry++;
+    return (int)(entry - map);
+}
+
+/* Length of the longest pattern in map; the scan stops at the first empty pattern. */
+static int registerMapMaxPattern(RegisterMap* map)
+{
+    int longest = 0;
+    for (RegisterMap* entry = map; strlen(entry->pattern) > 0; entry++)
+    {
+        int len = (int)strlen(entry->pattern);
+        if (len > longest)
+            longest = len;
+    }
+    return longest;
+}
+
+/* Read the ptt file named by pttfile and return its lines, each stripped of leading and
+ * trailing whitespace. Returns NULL if the file cannot be accessed, opened or read. */
+static struct bstrList* read_ptt(bstring pttfile)
+{
+    size_t ret = 0;
+    FILE* fp = NULL;
+    char buf[BUFSIZ];
+    struct bstrList* l = NULL;
+
+    if (access(bdata(pttfile), R_OK))
+    {
+        return NULL;
+    }
+    fp = fopen(bdata(pttfile), "r");
+    if (fp == NULL) {
+        fprintf(stderr, "fopen(%s): errno=%d\n", bdata(pttfile), errno);
+        return NULL;
+    }
+    /* Allocated only after fopen() succeeded so the early return above cannot leak it */
+    bstring content = bfromcstr("");
+    /* fread() returns a size_t that is never negative; ferror() tells a failure from EOF */
+    while ((ret = fread(buf, 1, sizeof(buf), fp)) > 0) {
+        bcatblk(content, buf, (int)ret);
+    }
+    if (ferror(fp)) {
+        fprintf(stderr, "fread(%p, 1, %lu, %p): %d, errno=%d\n", (void*)buf, (unsigned long)sizeof(buf), (void*)fp, (int)ret, errno);
+        fclose(fp);
+        bdestroy(content);
+        return NULL;
+    }
+    fclose(fp);
+    btrimws(content);
+
+    l = bsplit(content, '\n');
+    bdestroy(content);
+    for (int i = 0; l != NULL && i < l->qty; i++)
+    {
+        btrimws(l->entry[i]);
+    }
+    return l;
+}
+
+/* Write each entry of code as one line to filename; returns 0 on success and 1 on failure. */
+static int write_asm(bstring filename, struct bstrList* code)
+{
+    char newline = '\n';
+    size_t (*ownfwrite)(const void *ptr, size_t size, size_t nmemb, FILE *stream) = &fwrite;
+    FILE* fp = fopen(bdata(filename), "w");
+    if (!fp)
+    {
+        return 1;
+    }
+    for (int i = 0; i < code->qty; i++)
+    {
+        ownfwrite(bdata(code->entry[i]), 1, blength(code->entry[i]), fp);
+        ownfwrite(&newline, 1, sizeof(char), fp);
+    }
+    int failed = ferror(fp); /* a failed write sets the stream's error indicator */
+    return (fclose(fp) != 0 || failed) ? 1 : 0; /* buffered data may only fail at close */
+}
+
+#define ANALYSE_PTT_GET_INT(line, pattern, variable) \
+    bstring tmp = bmidstr((line), blength((pattern))+1, blength((line))-blength((pattern))); \
+    btrimws(tmp); \
+    (variable) = ownatoi(bdata(tmp)); \
+    bdestroy(tmp); \
+
+static struct bstrList* analyse_ptt(bstring pttfile, TestCase** testcase)
+{
+    struct bstrList* ptt = NULL;
+    TestCase* test = NULL;
+    struct bstrList* code = NULL;
+    bstring bBYTES = bformat("BYTES");
+    bstring bFLOPS = bformat("FLOPS");
+    bstring bSTREAMS = bformat("STREAMS");
+    bstring bTYPE = bformat("TYPE");
+    bstring bTYPEDOUBLE = bformat("DOUBLE");
+    bstring bTYPESINGLE = bformat("SINGLE");
+    bstring bTYPEINT = bformat("INT");
+    bstring bDESC = bformat("DESC");
+    bstring bLOADS = bformat("LOADS");
+    bstring bSTORES = bformat("STORES");
+    bstring bLOADSTORES = bformat("LOADSTORES");
+    bstring bINSTCONST = bformat("INSTR_CONST");
+    bstring bINSTLOOP = bformat("INSTR_LOOP");
+    bstring bUOPS = bformat("UOPS");
+    bstring bBRANCHES = bformat("BRANCHES");
+    bstring bLOOP = bformat("LOOP");
+    int (*ownatoi)(const char*) = &atoi;
+    /* Parse pttfile: keyword lines fill a new TestCase stored in *testcase; the LOOP line and all other lines are returned as code (NULL on failure). */
+    ptt = read_ptt(pttfile);
+
+    if (ptt && ptt->qty > 0)
+    {
+        test = calloc(1, sizeof(TestCase)); /* zeroed so that desc stays NULL when the file has no DESC line */
+        if (test)
+        {
+            test->loads = -1;
+            test->stores = -1;
+            test->loadstores = -1;
+            test->branches = -1;
+            test->instr_const = -1;
+            test->instr_loop = -1;
+            test->uops = -1;
+            code = bstrListCreate();
+            for (int i = 0; i < ptt->qty; i++)
+            {
+                if (bstrncmp(ptt->entry[i], bBYTES, blength(bBYTES)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bBYTES, test->bytes);
+                }
+                else if (bstrncmp(ptt->entry[i], bFLOPS, blength(bFLOPS)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bFLOPS, test->flops);
+                }
+                else if (bstrncmp(ptt->entry[i], bSTREAMS, blength(bSTREAMS)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bSTREAMS, test->streams);
+                }
+                else if (bstrncmp(ptt->entry[i], bLOADSTORES, blength(bLOADSTORES)) == BSTR_OK) /* must precede LOADS, which is a prefix of it */
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bLOADSTORES, test->loadstores);
+                }
+                else if (bstrncmp(ptt->entry[i], bSTORES, blength(bSTORES)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bSTORES, test->stores);
+                }
+                else if (bstrncmp(ptt->entry[i], bLOADS, blength(bLOADS)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bLOADS, test->loads);
+                }
+                else if (bstrncmp(ptt->entry[i], bINSTCONST, blength(bINSTCONST)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bINSTCONST, test->instr_const);
+                }
+                else if (bstrncmp(ptt->entry[i], bINSTLOOP, blength(bINSTLOOP)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bINSTLOOP, test->instr_loop);
+                }
+                else if (bstrncmp(ptt->entry[i], bUOPS, blength(bUOPS)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bUOPS, test->uops);
+                }
+                else if (bstrncmp(ptt->entry[i], bBRANCHES, blength(bBRANCHES)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bBRANCHES, test->branches);
+                }
+                else if (bstrncmp(ptt->entry[i], bLOOP, blength(bLOOP)) == BSTR_OK)
+                {
+                    ANALYSE_PTT_GET_INT(ptt->entry[i], bLOOP, test->stride);
+                    bstrListAdd(code, ptt->entry[i]);
+                }
+                else if (bstrncmp(ptt->entry[i], bDESC, blength(bDESC)) == BSTR_OK)
+                {
+                    test->desc = malloc((blength(ptt->entry[i])+2)*sizeof(char));
+                    if (test->desc)
+                    {
+                        int ret = snprintf(test->desc, blength(ptt->entry[i])+1, "%s", bdataofs(ptt->entry[i], blength(bDESC)+1));
+                        if (ret > 0)
+                        {
+                            test->desc[ret] = '\0';
+                        }
+                    }
+                }
+                else if (bstrncmp(ptt->entry[i], bTYPE, blength(bTYPE)) == BSTR_OK)
+                {
+                    bstring btype = bmidstr(ptt->entry[i], blength(bTYPE)+1, blength(ptt->entry[i])-blength(bTYPE));
+                    btrimws(btype);
+                    if (bstrncmp(btype, bTYPEDOUBLE, blength(bTYPEDOUBLE)) == BSTR_OK)
+                    {
+                        test->type = DOUBLE;
+                    }
+                    else if (bstrncmp(btype, bTYPESINGLE, blength(bTYPESINGLE)) == BSTR_OK)
+                    {
+                        test->type = SINGLE;
+                    }
+                    else if (bstrncmp(btype, bTYPEINT, blength(bTYPEINT)) == BSTR_OK)
+                    {
+                        test->type = INT;
+                    }
+                    else
+                    {
+                        fprintf(stderr, "Failed to determine type of benchmark\n");
+                        bdestroy(btype);
+                        bstrListDestroy(code);
+                        free(test->desc); free(test); /* desc is NULL or heap-allocated thanks to calloc */
+                        test = NULL;
+                        code = NULL;
+                        break;
+                    }
+                    bdestroy(btype);
+                }
+                else
+                {
+                    bstrListAdd(code, ptt->entry[i]);
+                }
+            }
+            *testcase = test;
+        }
+        bstrListDestroy(ptt);
+    }
+
+    bdestroy(bBYTES);
+    bdestroy(bFLOPS);
+    bdestroy(bSTREAMS);
+    bdestroy(bTYPE);
+    bdestroy(bTYPEDOUBLE);
+    bdestroy(bTYPESINGLE);
+    bdestroy(bTYPEINT);
+    bdestroy(bDESC);
+    bdestroy(bLOADS);
+    bdestroy(bSTORES);
+    bdestroy(bLOADSTORES);
+    bdestroy(bINSTCONST);
+    bdestroy(bINSTLOOP);
+    bdestroy(bUOPS);
+    bdestroy(bBRANCHES);
+    bdestroy(bLOOP);
+    return code;
+}
+
+/* Store the basename of pttfile without its last extension in testcase->name (heap-allocated).
+ * Returns 0 on success, -EINVAL for NULL arguments and -ENOMEM if the name cannot be allocated. */
+static int set_testname(char *pttfile, TestCase* testcase)
+{
+    if ((!testcase)||(!pttfile))
+    {
+        return -EINVAL;
+    }
+    bstring ptt = bfromcstr(basename(pttfile));
+    btrunc(ptt, bstrrchrp(ptt, '.', blength(ptt)-1)); /* name is kept as-is when it contains no '.' */
+    testcase->name = malloc((blength(ptt)+2) * sizeof(char));
+    if (testcase->name)
+    {
+        snprintf(testcase->name, blength(ptt)+1, "%s", bdata(ptt)); /* snprintf NUL-terminates */
+    }
+    bdestroy(ptt);
+    return (testcase->name) ? 0 : -ENOMEM;
+}
+
+
+/* Build the assembly listing for testcase from the ptt code lines in input; returns NULL if either argument is NULL. */
+static struct bstrList* parse_asm(TestCase* testcase, struct bstrList* input)
+{
+    struct bstrList* output = NULL;
+    if (testcase && input)
+    {
+        struct bstrList* pre = bstrListCreate();
+        struct bstrList* loop = bstrListCreate();
+        int got_loop = 0;
+        bstring bLOOP = bformat("LOOP");
+        int step = testcase->stride;
+        /* Lines before the first LOOP line go to pre, later ones to loop; LOOP lines themselves are dropped */
+        for (int i = 0; i < input->qty; i++)
+        {
+            if (bstrncmp(input->entry[i], bLOOP, blength(bLOOP)) == BSTR_OK)
+            {
+                got_loop = 1;
+                continue;
+            }
+            if (!got_loop)
+            {
+                bstrListAdd(pre, input->entry[i]);
+            }
+            else
+            {
+                bstrListAdd(loop, input->entry[i]);
+            }
+        }
+        bdestroy(bLOOP);
+
+        output = bstrListCreate();
+        /* header/loopheader/loopfooter/footer are not defined in this file - presumably the ISA headers included above append the surrounding assembly; TODO confirm */
+        header(output, testcase->name);
+
+        for (int i = 0; i < pre->qty; i++)
+        {
+            bstrListAdd(output, pre->entry[i]);
+        }
+        loopheader(output, "1", step);
+        for (int i = 0; i < loop->qty; i++)
+        {
+            bstrListAdd(output, loop->entry[i]);
+        }
+        loopfooter(output, "1", step);
+
+        footer(output, testcase->name);
+
+        bstrListDestroy(pre);
+        bstrListDestroy(loop);
+    }
+    return output;
+}
+
+/* Replace every occurrence of each map pattern in line by its register name.
+ * Longer patterns are applied first so that a pattern which is a prefix of a
+ * longer one cannot clobber it. Always returns 0. */
+static int searchreplace(bstring line, RegisterMap* map)
+{
+    int maxlen = registerMapMaxPattern(map);
+    int size = registerMapLength(map);
+
+    /* Visit all lengths down to 1: stopping at the first unused length would
+     * silently skip shorter patterns whenever the pattern lengths have a gap. */
+    for (int s = maxlen; s >= 1; s--)
+    {
+        for (int j = 0; j < size; j++)
+        {
+            if ((int)strlen(map[j].pattern) == s)
+            {
+                bstring pat = bfromcstr(map[j].pattern);
+                bstring reg = bfromcstr(map[j].reg);
+                bfindreplace(line, pat, reg, 0);
+                bdestroy(pat);
+                bdestroy(reg);
+            }
+        }
+    }
+    return 0;
+}
+
+/* Resolve the placeholders in every line of code. For each line the
+ * substitution order is StreamPatterns, Registers, Arguments, Sptr and
+ * finally Bptr. A NULL list is accepted. Always returns 0. */
+static int prepare_code(struct bstrList* code)
+{
+    if (!code)
+    {
+        return 0;
+    }
+
+    bstring sptr_pat = bfromcstr(Sptr.pattern);
+    bstring sptr_reg = bfromcstr(Sptr.reg);
+    bstring bptr_pat = bfromcstr(Bptr.pattern);
+    bstring bptr_reg = bfromcstr(Bptr.reg);
+
+    /* The lines do not influence each other, so one pass over the list applying
+     * all substitutions to a line gives the same text as one pass per map. */
+    for (int n = 0; n < code->qty; n++)
+    {
+        bstring line = code->entry[n];
+
+        searchreplace(line, StreamPatterns);
+        searchreplace(line, Registers);
+        searchreplace(line, Arguments);
+        bfindreplace(line, sptr_pat, sptr_reg, 0);
+        bfindreplace(line, bptr_pat, bptr_reg, 0);
+    }
+
+    bdestroy(sptr_pat);
+    bdestroy(sptr_reg);
+    bdestroy(bptr_pat);
+    bdestroy(bptr_reg);
+
+    return 0;
+}
+
+
+/* List the names (".ptt" stripped) of all *.ptt files in $HOME/.likwid/bench/<ARCHNAME>; NULL if there are none. */
+struct bstrList* dynbench_getall()
+{
+    struct bstrList* list = NULL;
+    DIR *dp = NULL;
+    struct dirent *ep = NULL;
+    DIR * (*ownopendir)(const char* folder) = &opendir;
+    int (*ownaccess)(const char*, int) = &access;
+    char* home = getenv("HOME");
+    if (!home) { fprintf(stderr, "Failed to get $HOME from environment\n"); return NULL; }
+    bstring path = bformat("%s/.likwid/bench/%s", home, ARCHNAME);
+    if (!ownaccess(bdata(path), R_OK|X_OK))
+    {
+        dp = ownopendir(bdata(path));
+        if (dp != NULL)
+        {
+            while ((ep = readdir(dp)) != NULL)
+            {
+                size_t namelen = strlen(ep->d_name); /* "." and ".." are shorter than the suffix */
+                if (namelen >= 4 && strncmp(&(ep->d_name[namelen-4]), ".ptt", 4) == 0)
+                {
+                    if (!list) list = bstrListCreate();
+                    bstring dname = bfromcstr(ep->d_name);
+                    btrunc(dname, blength(dname)-4);
+                    bstrListAdd(list, dname);
+                    bdestroy(dname);
+                }
+            }
+            closedir(dp);
+        }
+        else
+        {
+            fprintf(stderr, "Failed to enter folder %s\n", bdata(path));
+        }
+    }
+    bdestroy(path);
+    return list;
+}
+
+
+static bstring get_compiler(bstring candidates)
+{
+    bstring compiler = NULL;
+    bstring path = bfromcstr(getenv("PATH"));
+    struct bstrList *plist = NULL;
+    struct bstrList *clist = NULL;
+    int (*ownaccess)(const char*, int) = access;
+    /* Returns a copy of the first "<PATH dir>/<candidate>" that is readable and executable (PATH order wins), or NULL. */
+    plist = bsplit(path, ':');
+    clist = bsplit(candidates, ',');
+    /* Either list may be NULL (PATH unset, candidates NULL or allocation failure), so both are checked before use */
+    for (int i = 0; plist && clist && i < plist->qty && (!compiler); i++)
+    {
+        for (int j = 0; j < clist->qty && (!compiler); j++)
+        {
+            bstring tmp = bformat("%s/%s", bdata(plist->entry[i]), bdata(clist->entry[j]));
+            if (!ownaccess(bdata(tmp), R_OK|X_OK))
+            {
+                compiler = bstrcpy(tmp);
+            }
+            bdestroy(tmp);
+        }
+    }
+    bdestroy(path);
+    bstrListDestroy(plist);
+    bstrListDestroy(clist);
+    return compiler;
+}
+
+/* Run "<compiler> <flags> <asmfile> -o <objfile>" through popen() and echo the command's stdout to stderr.
+ * Returns 0 only if the command was started and exited with status 0, otherwise -1. */
+static int compile_file(bstring compiler, bstring flags, bstring asmfile, bstring objfile)
+{
+    if (blength(compiler) == 0 || blength(asmfile) == 0)
+        return -1;
+    int err = -1;
+    size_t ret = 0;
+    char buf[1024];
+    bstring bstdout = bfromcstr("");
+    /* NOTE(review): the arguments are pasted unquoted into a shell command line, so whitespace
+     * or shell metacharacters in the paths or flags alter the command - confirm inputs are trusted. */
+    bstring cmd = bformat("%s %s %s -o %s", bdata(compiler), bdata(flags), bdata(asmfile), bdata(objfile));
+
+    FILE * fp = popen(bdata(cmd), "r");
+    if (fp)
+    {
+        /* fread() returns a size_t: 0 means EOF or error, it is never negative */
+        while ((ret = fread(buf, 1, sizeof(buf), fp)) > 0)
+        {
+            bcatblk(bstdout, buf, (int)ret);
+        }
+        if (blength(bstdout) > 0)
+        {
+            fprintf(stderr, "%s\n", bdata(bstdout));
+        }
+        /* pclose() returns the command's wait status, so a failed compilation is reported */
+        err = (pclose(fp) == 0) ? 0 : -1;
+    }
+    else
+    {
+        fprintf(stderr, "popen(%s): errno=%d\n", bdata(cmd), errno);
+    }
+    bdestroy(cmd);
+    bdestroy(bstdout);
+
+    return err;
+}
+
+
+/* dlopen() location and resolve the symbol testcase->name into testcase->kernel; 0 on success, -1 on failure. */
+static int open_function(bstring location, TestCase *testcase)
+{
+    char *error;
+    void* (*owndlsym)(void*, const char*) = dlsym;
+
+    dlerror();
+    testcase->dlhandle = dlopen(bdata(location), RTLD_LAZY);
+    if (!testcase->dlhandle) {
+        fprintf(stderr, "Error opening location %s: %s\n", bdata(location), dlerror());
+        return -1;
+    }
+    dlerror(); /* clear any stale error so that the dlerror() check below reflects dlsym only */
+    testcase->kernel = owndlsym(testcase->dlhandle, testcase->name);
+    if ((error = dlerror()) != NULL)  {
+        fprintf(stderr, "Error opening function %s: %s\n", testcase->name, error); /* print before dlclose touches the error state */
+        dlclose(testcase->dlhandle);
+        testcase->dlhandle = NULL; /* do not leave a closed handle or its symbol behind */
+        testcase->kernel = NULL;
+        return -1;
+    }
+    return 0;
+}
+
+
+/* Returns 1 if $HOME/.likwid/bench/<ARCHNAME>/<testname>.ptt is readable, otherwise 0
+ * (also when HOME is not set in the environment). */
+int dynbench_test(bstring testname)
+{
+    const char* homedir = getenv("HOME");
+    if (homedir == NULL)
+    {
+        fprintf(stderr, "Failed to get $HOME from environment\n");
+        return 0;
+    }
+
+    bstring candidate = bformat("%s/.likwid/bench/%s/%s.ptt", homedir, ARCHNAME, bdata(testname));
+    int readable = (access(bdata(candidate), R_OK) == 0) ? 1 : 0;
+    bdestroy(candidate);
+
+    return readable;
+}
+
+/* Load user benchmark <testname> from $HOME/.likwid/bench/<ARCHNAME>/<testname>.ptt into *testcase. If tmpfolder and
+ * compilers are given, the kernel is also built in <tmpfolder>/<pid> and loaded. Returns 0 on success, non-zero otherwise. */
+int dynbench_load(bstring testname, TestCase **testcase, char* tmpfolder, char *compilers, char* compileflags)
+{
+    int err = -1;
+    TestCase *test = NULL;
+    char* home = getenv("HOME");
+    if (!home)
+    {
+        fprintf(stderr, "Failed to get $HOME from environment\n");
+        return err;
+    }
+    bstring pttfile = bformat("%s/.likwid/bench/%s/%s.ptt", home, ARCHNAME, bdata(testname));
+    if (!access(bdata(pttfile), R_OK))
+    {
+        struct bstrList* code = analyse_ptt(pttfile, &test);
+        if (code && test)
+        {
+            test->dlhandle = NULL;
+            test->kernel = NULL;
+            test->name = malloc((blength(testname)+2) * sizeof(char));
+            if (test->name)
+            {
+                int ret = snprintf(test->name, blength(testname)+1, "%s", bdata(testname));
+                if (ret > 0)
+                {
+                    test->name[ret] = '\0';
+                }
+                if (tmpfolder && compilers)
+                {
+                    pid_t pid = getpid();
+                    bstring buildfolder = bformat("%s/%ld", tmpfolder, (long)pid);
+                    if (mkdir(bdata(buildfolder), 0700) == 0)
+                    {
+                        int asm_written = 0;
+                        bstring asmfile = bformat("%s/%ld/%s.S", tmpfolder , (long)pid, bdata(testname));
+
+                        struct bstrList* asmb = parse_asm(test, code);
+                        if (asmb)
+                        {
+                            prepare_code(asmb);
+                            if (write_asm(asmfile, asmb) != 0)
+                            {
+                                fprintf(stderr, "Failed to write assembly to file %s\n", bdata(asmfile));
+                            }
+                            else
+                            {
+                                asm_written = 1;
+                            }
+                            bstrListDestroy(asmb);
+                        }
+                        else
+                        {
+                            fprintf(stderr, "Cannot parse assembly\n");
+                        }
+
+                        bstring candidates = bfromcstr(compilers);
+                        bstring compiler = get_compiler(candidates);
+                        if (asm_written && compiler)
+                        {
+                            int cret = 0;
+                            bstring cflags;
+                            if (compileflags)
+                            {
+                                cflags = bfromcstr(compileflags);
+                            }
+                            else
+                            {
+                                cflags = bfromcstr("");
+                            }
+                            bstring objfile = bformat("%s/%ld/%s.o", tmpfolder , (long)pid, bdata(testname));
+                            cret = compile_file(compiler, cflags, asmfile, objfile);
+                            if (cret == 0)
+                            {
+                                cret = open_function(objfile, test);
+                                if (cret == 0)
+                                {
+                                    err = 0;
+                                    *testcase = test;
+                                }
+                                else
+                                {
+                                    fprintf(stderr, "Cannot load function %s from %s\n", bdata(testname), bdata(objfile));
+                                }
+                            }
+                            else
+                            {
+                                fprintf(stderr, "Cannot compile file %s to %s\n", bdata(asmfile), bdata(objfile));
+                            }
+                            bdestroy(cflags);
+                            bdestroy(objfile);
+                        }
+                        else if (!compiler) /* a missing assembly file has already been reported above */
+                        {
+                            fprintf(stderr, "Cannot find any compiler %s\n", compilers);
+                        }
+                        bdestroy(candidates);
+                        bdestroy(compiler);
+                        bdestroy(asmfile);
+                    }
+                    else
+                    {
+                        fprintf(stderr, "Cannot create temporary directory %s\n", bdata(buildfolder));
+                        err = errno;
+                    }
+                    bdestroy(buildfolder);
+                }
+                else
+                {
+                    err = 0;
+                    *testcase = test;
+                }
+            }
+            else
+            {
+                fprintf(stderr, "Failed to allocate space for the testname\n");
+            }
+            bstrListDestroy(code);
+
+        }
+        else
+        {
+            fprintf(stderr, "Cannot read ptt file %s\n", bdata(pttfile));
+        }
+
+    }
+    else
+    {
+        fprintf(stderr, "Cannot open ptt file %s\n", bdata(pttfile));
+    }
+    bdestroy(pttfile);
+    /* NOTE(review): on every failure path 'test' is neither handed to the caller nor freed here - consider releasing it */
+    return err;
+}
+
+/* Release a test case: close its library, delete <tmpfolder>/<pid>/<name>.S, the matching .o and the
+ * folder itself (only if tmpfolder is given), then free the test case. Always returns 0. */
+int dynbench_close(TestCase* testcase, char* tmpfolder)
+{
+    if (testcase)
+    {
+        if (testcase->dlhandle)
+        {
+            dlclose(testcase->dlhandle);
+            testcase->dlhandle = NULL;
+            testcase->kernel = NULL;
+        }
+        if (tmpfolder)
+        {
+            /* pid_t is not necessarily long: cast explicitly to match the %ld conversion */
+            bstring buildfolder = bformat("%s/%ld", tmpfolder, (long)getpid());
+            bstring asmfile = bformat("%s/%s.S", bdata(buildfolder), testcase->name);
+            bstring objfile = bformat("%s/%s.o", bdata(buildfolder), testcase->name);
+
+            if (!access(bdata(asmfile), R_OK)) unlink(bdata(asmfile));
+            if (!access(bdata(objfile), R_OK)) unlink(bdata(objfile));
+            if (!access(bdata(buildfolder), R_OK)) rmdir(bdata(buildfolder));
+
+            bdestroy(asmfile);
+            bdestroy(objfile);
+            bdestroy(buildfolder);
+        }
+        free(testcase->name);
+        testcase->name = NULL;
+        free(testcase->desc);
+        testcase->desc = NULL;
+        free(testcase);
+    }
+    return 0;
+}
diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c
index 017d4b617..0f7949d5d 100644
--- a/bench/src/strUtil.c
+++ b/bench/src/strUtil.c
@@ -225,6 +225,11 @@ parse_streams(Workgroup* group, const_bstring str, int numberOfStreams)
 {
     struct bstrList* tokens;
     struct bstrList* subtokens;
+    if (group->init_per_thread)
+    {
+        fprintf(stderr, "Error: Cannot place stream in different stream when initialization per thread is selected.\n");
+        return -1;
+    }
     tokens = bsplit(str,',');
 
     if (tokens->qty < numberOfStreams)
@@ -297,8 +302,13 @@ bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int number
             bstrListDestroy(tokens);
             return 1;
         }
-        parse_streams(group, tokens->entry[1], numberOfStreams);
+        parseStreams = parse_streams(group, tokens->entry[1], numberOfStreams);
         bdestroy(domain);
+        if (parseStreams)
+        {
+            bstrListDestroy(tokens);
+            return parseStreams;
+        }
     }
     else if (tokens->qty == 1)
     {
diff --git a/bench/src/threads.c b/bench/src/threads.c
index 8e99a77e6..b6b4f8ffa 100644
--- a/bench/src/threads.c
+++ b/bench/src/threads.c
@@ -37,6 +37,7 @@
 
 #include <errno.h>
 #include <threads.h>
+#include <strUtil.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
@@ -135,22 +136,12 @@ threads_create(void *(*startRoutine)(void*))
 }
 
 void
-threads_createGroups(int numberOfGroups)
+threads_createGroups(int numberOfGroups, Workgroup *groups)
 {
     int i;
     int j;
-    int numThreadsPerGroup;
     int globalId = 0;
 
-    if (numThreads % numberOfGroups)
-    {
-        fprintf(stderr, "ERROR: Not enough threads %d to create %d groups\n",numThreads,numberOfGroups);
-    }
-    else
-    {
-        numThreadsPerGroup = numThreads / numberOfGroups;
-    }
-
     threads_groups = (ThreadGroup*) malloc(numberOfGroups * sizeof(ThreadGroup));
     if (!threads_groups)
     {
@@ -160,20 +151,20 @@ threads_createGroups(int numberOfGroups)
 
     for (i = 0; i < numberOfGroups; i++)
     {
-        threads_groups[i].numberOfThreads = numThreadsPerGroup;
-        threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup * sizeof(int));
+        threads_groups[i].numberOfThreads = groups[i].numberOfThreads;
+        threads_groups[i].threadIds = (int*) malloc(threads_groups[i].numberOfThreads * sizeof(int));
         if (!threads_groups[i].threadIds)
         {
             fprintf(stderr, "ERROR: Cannot allocate threadID list for thread groups - %s\n", strerror(errno));
             exit(EXIT_FAILURE);
         }
 
-        for (j = 0; j < numThreadsPerGroup; j++)
+        for (j = 0; j < threads_groups[i].numberOfThreads; j++)
         {
             threads_data[globalId].threadId = j;
             threads_data[globalId].groupId = i;
             threads_data[globalId].numberOfGroups = numberOfGroups;
-            threads_data[globalId].numberOfThreads = numThreadsPerGroup;
+            threads_data[globalId].numberOfThreads = threads_groups[i].numberOfThreads;
             threads_groups[i].threadIds[j] = globalId++;
         }
     }
diff --git a/bench/x86-64/stream_mem_avx512.ptt b/bench/x86-64/stream_mem_avx512.ptt
new file mode 100644
index 000000000..b72ddedd8
--- /dev/null
+++ b/bench/x86-64/stream_mem_avx512.ptt
@@ -0,0 +1,29 @@
+STREAMS 3
+TYPE DOUBLE
+FLOPS 2
+BYTES 24
+DESC Double-precision stream triad A(i) = B(i)*c + C(i), uses AVX-512 and non-temporal stores
+LOADS 2
+STORES 1
+INSTR_CONST 17
+INSTR_LOOP 19
+UOPS 26
+vmovapd zmm5, [rip+SCALAR]
+LOOP 32
+vmovapd    zmm1, [STR1 + GPR1*8]
+vmovapd    zmm2, [STR1 + GPR1*8+64]
+vmovapd    zmm3, [STR1 + GPR1*8+128]
+vmovapd    zmm4, [STR1 + GPR1*8+192]
+vmulpd     zmm1, zmm1, zmm5
+vaddpd     zmm1, zmm1, [STR2 + GPR1*8]
+vmulpd     zmm2, zmm2, zmm5
+vaddpd     zmm2, zmm2, [STR2 + GPR1*8+64]
+vmulpd     zmm3, zmm3, zmm5
+vaddpd     zmm3, zmm3, [STR2 + GPR1*8+128]
+vmulpd     zmm4, zmm4, zmm5
+vaddpd     zmm4, zmm4, [STR2 + GPR1*8+192]
+vmovntpd    [STR0 + GPR1*8]   , zmm1
+vmovntpd    [STR0 + GPR1*8+64], zmm2
+vmovntpd    [STR0 + GPR1*8+128], zmm3
+vmovntpd    [STR0 + GPR1*8+192], zmm4
+
diff --git a/config.mk b/config.mk
index 82d2969a9..b566e22bf 100644
--- a/config.mk
+++ b/config.mk
@@ -6,7 +6,8 @@
 
 # Please have a look in INSTALL and the WIKI for details on
 # configuration options setup steps.
-# supported: GCC, CLANG, ICC, MIC (ICC), GCCX86 (for 32bit systems)
+# Supported: GCC, CLANG, ICC, MIC (ICC), GCCX86 (for 32bit systems)
+# GCCARMv8, GCCARMv7 and GCCPOWER
 COMPILER = GCC#NO SPACE
 
 # Path were to install likwid
@@ -57,16 +58,25 @@ INSTALLED_PREFIX ?= $(PREFIX)#NO SPACE
 INSTALLED_BINPREFIX = $(INSTALLED_PREFIX)/bin#NO SPACE
 INSTALLED_LIBPREFIX = $(INSTALLED_PREFIX)/lib#NO SPACE
 
+# Build the accessDaemon. Have a look in the WIKI for details.
+BUILDDAEMON = true#NO SPACE
 # For the daemon based secure msr/pci access configure
 # the absolute path to the msr daemon executable.
 ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
 INSTALLED_ACCESSDAEMON = $(INSTALLED_PREFIX)/sbin/likwid-accessD#NO SPACE
 
-# Build the accessDaemon. Have a look in the WIKI for details.
-BUILDDAEMON = true#NO SPACE
 # Build the setFrequencies daemon to allow users setting the CPU and Uncore
 # frequency
 BUILDFREQ = true#NO SPACE
+# Paths for the frequency daemon after installation
+FREQDAEMON = $(PREFIX)/sbin/likwid-setFreq#NO SPACE
+INSTALLED_FREQDAEMON = $(INSTALLED_PREFIX)/sbin/likwid-setFreq#NO SPACE
+
+# Build the appDaemon. It's not really a daemon but an LD_PRELOAD library
+# It is required to get access to the application context.
+BUILDAPPDAEMON=true
+APPDAEMON = $(PREFIX)/lib/likwid-appDaemon.so#NO SPACE
+INSTALLED_APPDAEMON = $(INSTALLED_PREFIX)/lib/likwid-appDaemon.so#NO SPACE
 
 # chown installed tools to this user/group
 # if you change anything here, make sure that the user/group can access
diff --git a/doc/likwid-agent.1 b/doc/likwid-agent.1
deleted file mode 100644
index f50dbca45..000000000
--- a/doc/likwid-agent.1
+++ /dev/null
@@ -1,94 +0,0 @@
-.TH LIKWID-AGENT 1 <DATE> likwid\-VERSION
-.SH NAME
-likwid-agent \- monitoring daemon for hardware performance counters
-.SH SYNOPSIS
-.B likwid-agent <config_file>
-.SH DESCRIPTION
-.B likwid-agent
-is a daemon application that uses
-.B likwid-perfctr(1)
-to measure hardware performance counters. The basic configuration is in a global configuration file. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed using formulas to interested metrics. In order to output to much data, the data can be further filtered or aggregated.
-.B likwid-agent
-provides multiple store backends like logfiles, RRD (Round Robin Database) or gmetric (Ganglia Monitoring System).
-
-.SH CONFIG FILE
-The global configuration file has the following options:
-.TP
-.B GROUPPATH <path>
-Path to the group files containing event set and output defintitions. See section
-.B GROUP FILES
-for information.
-.TP
-.B EVENTSET <group1> <group2> ...
-Space separated list of groups (without .txt) that should be monitored.
-.TP
-.B DURATION <time>
-Measurement duration in seconds.
-.TP
-.B LOGPATH <path>
-Specify a logfile.
-.TP
-.B GMETRIC <True/False>
-Activates the output to gmetric.
-.TP
-.B GMETRICPATH <path>
-Set path to the gmetric executable.
-.TP
-.B GMETRICCONFIG <path>
-Set a custom configuration file is needed for gmetric.
-.TP
-.B RRD <True/False>
-Activates the output to RRD files (Round Robin Database).
-.TP
-.B RRDPATH <path>
-Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.
-.TP
-.B SYSLOG <True/False>
-Activates the output to system log using logger.
-.TP
-.B SYSLOGPRIO <prio>
-Set the priority string for logger, default is 'local0.notice'.
-
-.SH GROUP FILES
-The group files are adapted performance group files as used by
-.B likwid-perfctr(1).
-This makes it easy to uses the predefined and often used performance groups as basis for the monitoring. The folder structure of for the groups is
-.B <GROUPPATH>/<SHORT_ARCH_NAME>/
-with
-.B <SHORT_ARCH_NAME>
-similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
-.TP
-.B SHORT <string>
-A short descriptive information about the group.
-.TP
-.B EVENTSET
-.TP
-.B <counter1> <event1>
-.TP
-.B <counter2>:<option> <event2>
-Defintion of the eventset similar to the performance groups.
-.TP
-.B METRICS
-.TP
-.B <metricname> <formula>
-.TP
-.B <filter> <metricname> <formula>
-Defintion of the output metrics. The syntax follows the
-.B METRICS
-defintion of the performance groups as used by
-.B likwid-perfctr(1).
-If no function is set at the beginning of the line,
-.B <formula>
-is evaluated for every CPU and send to the output backends. The
-.B <metricname>
-gets the prefix "T<cpuid> ". To avoid writing to much data to the backends, the data can be reduced by
-.B <filter>.
-The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter sends only the data from the first CPU to the output backends commonly used for the measurement duration.
-
-
-.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl@googlemail.com>.
-.SH BUGS
-Report Bugs on <https://github.com/RRZE-HPC/likwid/issues>.
-.SH "SEE ALSO"
-likwid-perfctr(1), rrdtool(1), gmetric(1)
diff --git a/doc/likwid-bench.1 b/doc/likwid-bench.1
index 3a1d71917..33f8fae36 100644
--- a/doc/likwid-bench.1
+++ b/doc/likwid-bench.1
@@ -11,12 +11,16 @@ likwid-bench \- low-level benchmark suite and microbenchmarking framework
 .IR <min_time> ]
 .RB [ \-w
 .IR <workgroup_expression> ]
+.RB [ \-W
+.IR <workgroup_expression_short> ]
 .RB [ \-l
 .IR <testname> ]
 .RB [ \-d
 .IR <delimiter> ]
 .RB [ \-i
 .IR <iterations> ]
+.RB [ \-f
+.IR <filepath> ]
 .SH DESCRIPTION
 .B likwid-bench
 is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
@@ -29,7 +33,7 @@ as a wrapper to
 .B likwid-bench.
 This requires to build
 .B likwid-bench
-with instrumentation enabled in config.mk.
+with instrumentation enabled in config.mk. Benchmarks can be dynamically added when a proper ptt file is present at $HOME/.likwid/bench/<arch>/<testname>.ptt . The files are converted to a .S file and then compiled using either gcc, icc or pgcc (searched in $PATH). The default folder is /tmp/<PID>.
 .SH OPTIONS
 .TP
 .B \-\^h
@@ -50,13 +54,19 @@ The amount of iterations is determined using this value. Default: 1 second.
 Name of the benchmark code to run (mandatory).
 .TP
 .B \-\^w <workgroup_expression>
-Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory).
+Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory). The first thread in the thread domain initializes the stream.
+.TP
+.B \-\^W <workgroup_expression_short>
+Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory). Each thread in the workgroup initializes its own chunk of the stream.
 .TP
 .B \-\^l <testname>
 list properties of a benchmark code.
 .TP
 .B \-\^i <iterations>
 Set the number of iterations per thread (optional)
+.TP
+.B \-\^f <filepath>
+Filepath for the dynamic generation of benchmarks. Default: /tmp/. The <PID> is always appended.
 
 .SH WORKGROUP SYNTAX
 
@@ -82,8 +92,9 @@ can be provided. Optionally for every stream (array, vector) the placement can b
 the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams can be aquired by the
 .B \-l
 option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be explicitly placed. Please refer to the Wiki pages on
-.B http://code.google.com/p/likwid/wiki/LikwidBench
+.B https://github.com/RRZE-HPC/likwid/wiki/Likwid-Bench
 for further details and examples on usage.
+With -W each thread initializes its own chunk of the streams but placement of the streams is deactivated.
 
 
 .SH EXAMPLE
@@ -97,7 +108,7 @@ benchmark on socket 0 (
 .TP
 .B likwid-bench -t copy -w S0:100kB
 .PP
-Since no 
+Since no
 .B <num_threads>
 is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads and the number of iterations is determined automatically.
 .IP 2. 4
diff --git a/doc/likwid-mpirun.1 b/doc/likwid-mpirun.1
index e3db441ba..54bbf59c7 100644
--- a/doc/likwid-mpirun.1
+++ b/doc/likwid-mpirun.1
@@ -18,7 +18,8 @@ likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
 .IR mpitype ]
 .RB [ \-g
 .IR eventset ]
-.RB [\-\-]
+.RB [\-\-mpiopts
+.IR opts ]
 .SH DESCRIPTION
 .B likwid-mpirun
 is a command line application that wraps the vendor-specific mpirun tool and adds calls to
@@ -62,7 +63,7 @@ and
 .TP
 .B \-\^mpi <mpitype>
 specifies the MPI implementation that should be used by the wrapper. Possible values are
-.B intelmpi, openmpi
+.B intelmpi, openmpi, slurm
 and
 .B mvapich2
 .TP
@@ -72,8 +73,11 @@ activates the Marker API for the executed MPI processes
 .B \-\^O
 prints output in CSV not ASCII tables
 .TP
+.B \-\-\^mpiopts <opts>
+adds options for the underlying MPI implementation. Please use proper quoting.
+.TP
 .B \-\-
-stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
+stops parsing arguments for likwid-mpirun.
 
 .SH EXAMPLE
 .IP 1. 4
diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1
index 30ecdfb12..ebfc336c6 100644
--- a/doc/likwid-perfctr.1
+++ b/doc/likwid-perfctr.1
@@ -2,7 +2,7 @@
 .SH NAME
 likwid-perfctr \- configure and read out hardware performance counters on x86 CPUs
 .SH SYNOPSIS
-.B likwid-perfctr 
+.B likwid-perfctr
 .RB [\-vhHmaiefO]
 .RB [ \-c
 .IR core_list ]
@@ -61,7 +61,7 @@ print available performance groups for current processor, then exit.
 print available counters and performance events of current processor.
 .TP
 .B \-\^o, \-\-\^output <filename>
-store all ouput to a file instead of stdout. For the filename the following placeholders are supported: 
+store all output to a file instead of stdout. For the filename the following placeholders are supported:
 %j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h host name and %p for process pid.
 The placeholders must be separated by underscore as, e.g., -o test_%h_%p. You must specify a suffix to
 the filename. For txt the output is printed as is to the file. Other suffixes trigger a filter on the output.
@@ -76,11 +76,11 @@ for details).
 print cpuid information about processor and about Intel Performance Monitoring features, then exit.
 .TP
 .B \-\^c <cpu expression>
-specify a numerical list of processors. The list may contain multiple 
+specify a numerical list of processors. The list may contain multiple
 items, separated by comma, and ranges. For example 0,3,9-11.
 .TP
 .B \-\^C <cpu expression>
-specify a numerical list of processors. The list may contain multiple 
+specify a numerical list of processors. The list may contain multiple
 items, separated by comma, and ranges. For example 0,3,9-11. This variant will
 also pin the threads to the cores. Also logical numberings can be used.
 .TP
@@ -111,7 +111,7 @@ Print only events and corresponding counters matching <search_str>
 Always print statistics table
 
 .SH EXAMPLE
-Because 
+Because
 .B likwid-perfctr
 measures on processors and not single applications it is necessary to ensure
 that processes and threads are pinned to dedicated resources. You can either pin the application yourself
@@ -148,7 +148,7 @@ On Intel processors fixed events are measured on dedicated counters. These are
 .B INSTR_RETIRED_ANY
 and
 .B CPU_CLK_UNHALTED_CORE.
-If you configure these fixed counters, 
+If you configure these fixed counters,
 .B likwid-perfctr
 will calculate the run time and CPI metrics for your run.
 
@@ -162,12 +162,12 @@ Examples can be found in examples folder <INSTALLEDPREFIX>/share/likwid/examples
 The following code snippet shows the necessary calls:
 
 .nf
-#include <likwid.h>
+#include <likwid-marker.h>
 
 /* only one thread calls init */
 LIKWID_MARKER_INIT;
 
-/* Must be called by each thread the should 
+/* Must be called by each thread that should
  * perform measurements.
  * If you place it in the same parallel
  * region as LIKWID_MARKER_START, perform a
@@ -181,11 +181,11 @@ LIKWID_MARKER_THREADINIT;
  * of START and STOP calls. Call it once for each
  * thread in parallel environment.
  * Note: No whitespace characters are allowed in the region tags
- * This call is optional, START will do the same operations.
+ * This call is optional but RECOMMENDED, START will do the same operations.
  */
 LIKWID_MARKER_REGISTER("name");
 
-/* Start measurement 
+/* Start measurement
  * Note: No whitespace characters are allowed in the region tags
  */
 LIKWID_MARKER_START("name");
@@ -201,6 +201,19 @@ LIKWID_MARKER_STOP("name");
  */
 LIKWID_MARKER_SWITCH;
 
+/* If you want to get the data of a region inside your application:
+ * nevents is an (int*) and is used as input length of the events array. After
+ * the call, nevents contains the actual amount of events.
+ * events is an array of doubles (double*), time is a pointer to double to
+ * retrieve the measured runtime of the region and count is a pointer to int
+ * and is filled with the call count of the region.
+ */
+LIKWID_MARKER_GET("name", nevents, events, time, count);
+
+/* If you want to reset the counts for a region
+ */
+LIKWID_MARKER_RESET("name");
+
 /* Finally */
 LIKWID_MARKER_CLOSE;
 .fi
@@ -219,7 +232,7 @@ mode is for custom event sets:
 For performance groups with metrics:
 .B <groupID> <numberOfMetrics> <numberOfThreads> <Timestamp> <Metric1_Thread1> <Metric2_Thread1> ... <Metric1_Thread2> ...<MetricN_ThreadM>
 
-For timeline mode there is a frontend application likwid-perfscope(1), which enables live plotting of selected events. Please be aware that with high frequencies (<100ms), the values differ from the real results but the behavior of them is valid. 
+For timeline mode there is a frontend application likwid-perfscope(1), which enables live plotting of selected events. Please be aware that with high frequencies (<100ms), the values differ from the real results but the behavior of them is valid.
 
 .IP 6. 4
 Using likwid in stethoscope mode:
diff --git a/doc/likwid-pin.1 b/doc/likwid-pin.1
index 62bf98303..b830fefc3 100644
--- a/doc/likwid-pin.1
+++ b/doc/likwid-pin.1
@@ -3,7 +3,7 @@
 likwid-pin \- pin a sequential or threaded application to dedicated processors
 .SH SYNOPSIS
 .B likwid-pin
-.RB [\-vhSpqi]
+.RB [\-vhSpqim]
 .RB [ \-V
 .IR <verbosity> ]
 .RB [ \-c/\-C
@@ -81,6 +81,9 @@ prints the available thread domains for logical pinning
 .B \-\^i
 set NUMA memory policy to interleave involving all NUMA nodes involved in pinning
 .TP
+.B \-\^m
+set NUMA memory policy to membind involving all NUMA nodes involved in pinning
+.TP
 .B \-\^d <delim>
 usable with
 .B \-\^p
diff --git a/ext/GOTCHA/COPYRIGHT b/ext/GOTCHA/COPYRIGHT
new file mode 100644
index 000000000..82c0160b8
--- /dev/null
+++ b/ext/GOTCHA/COPYRIGHT
@@ -0,0 +1,17 @@
+GOTCHA is developed by Lawrence Livermore National Security.
+The following copyrights apply:
+
+Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
+the Lawrence Livermore National Laboratory. Written by David Poliakoff 'poliakoff1 at llnl dot gov'
+and Matthew LeGendre 'legendre1 at llnl dot gov'. CODE-730558. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under 
+the terms of the GNU Lesser General Public License (as published by the Free Software
+Foundation) version 2.1 dated February 1999.  This program is distributed in 
+the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the 
+IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 
+the terms and conditions of the GNU Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public License along 
+with this program; if not, write to the Free Software Foundation, Inc., 59 
+Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+
diff --git a/ext/GOTCHA/LGPL b/ext/GOTCHA/LGPL
new file mode 100644
index 000000000..d7da6168f
--- /dev/null
+++ b/ext/GOTCHA/LGPL
@@ -0,0 +1,366 @@
+OUR NOTICE AND TERMS AND CONDITIONS OF THE GNU GENERAL PUBLIC LICENSE
+
+Our Preamble Notice
+
+A. This notice is required to be provided under our contract with the U.S.
+Department of Energy (DOE). This work was produced at the Lawrence Livermore
+National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.
+
+B. Neither the United States Government nor Lawrence Livermore National Security,
+LLC nor any of their employees, makes any warranty, express or implied, or assumes
+any liability or responsibility for the accuracy, completeness, or usefulness of any
+information, apparatus, product, or process disclosed, or represents that its use would not
+infringe privately-owned rights.
+
+C. Also, reference herein to any specific commercial products, process, or services by trade
+name, trademark, manufacturer or otherwise does not necessarily constitute or imply its
+endorsement, recommendation, or favoring by the United States Government or Lawrence
+Livermore National Security, LLC. The views and opinions of authors expressed herein do
+not necessarily state or reflect those of the United States Government or Lawrence
+Livermore National Security, LLC, and shall not be used for advertising or product
+endorsement purposes.
+
+The precise terms and conditions for copying, distribution and modification follows.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
diff --git a/ext/GOTCHA/Makefile b/ext/GOTCHA/Makefile
new file mode 100644
index 000000000..4bc076740
--- /dev/null
+++ b/ext/GOTCHA/Makefile
@@ -0,0 +1,71 @@
+SRC_DIRS    = ./src
+MAKE_DIR   = ../../make
+# SRC_DIRS: directories searched for *.c sources; MAKE_DIR: location of the shared make fragments included below.
+#DO NOT EDIT BELOW
+# Load the top-level config, the fragment selected by $(COMPILER), and the shared checks/defines (contents not visible here).
+include ../../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+# These plain '=' assignments follow the includes, so they replace any makefile-level CFLAGS/INCLUDES/LIBS set there.
+CFLAGS    = -O2 -Wall -fPIC
+INCLUDES  = -I./include
+LIBS      = -lm -Wl,-E -ldl
+# Q is prepended to recipe commands (default '@' hides them); DEBUG=true adds -g; COMPILER=MIC adds -mmic.
+Q         ?= @
+ifeq ($(strip $(DEBUG)),true)
+DEBUG_FLAGS = -g
+else
+DEBUG_FLAGS =
+endif
+ifeq ($(strip $(COMPILER)),MIC)
+CFLAGS += -mmic
+LFLAGS += -mmic
+endif
+
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+# Sources are found through VPATH, so FILES holds bare file names and OBJ maps them into BUILD_DIR.
+VPATH     = $(SRC_DIRS)
+FILES     = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
+OBJ       = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
+# Append the project defines and include paths, then drop -D_GNU_SOURCE from the preprocessor flags.
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+CPPFLAGS := $(filter-out -D_GNU_SOURCE,$(CPPFLAGS))
+
+# File name (directory part removed) of the library to link; the built-in notdir avoids forking `basename` on every expansion.
+LIBGOTCHA = $(notdir $(TARGET_GOTCHA_LIB))
+INTERPRETER = lua
+
+.NOTPARALLEL: all
+# Relies on the serial execution requested above: build directory, then objects, then the linked library.
+all: $(BUILD_DIR) $(OBJ)  $(LIBGOTCHA)
+# -p keeps directory creation idempotent if the directory already exists when the recipe runs.
+$(BUILD_DIR):
+	@mkdir -p $(BUILD_DIR)
+
+# No static archive is produced for GOTCHA; requesting this target only prints the notice below.
+$(STATIC_LIBGOTCHA): $(OBJ)
+	$(info "GOTCHA cannot be build statically in LIKWID")
+# Link all objects into a shared library with a versioned soname. NOTE(review): output is written to $(LIBGOTCHA), not $@ - confirm both name the same file.
+$(SHARED_LIBGOTCHA): $(OBJ)
+	$(Q)$(CC) $(LFLAGS) -Wl,-soname,$(LIBGOTCHA).$(VERSION).$(RELEASE) -shared -fPIC -o $(LIBGOTCHA) $(OBJ) $(LIBS)
+
+#PATTERN RULES: compile each source, then write its header dependencies to a .d file; BUILD_DIR is order-only so it exists first
+$(BUILD_DIR)/%.o:  %.c | $(BUILD_DIR)
+	$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $@ -MM  $< > $(BUILD_DIR)/$*.d
+# Load generated dependency files unless every requested goal is a cleaning goal (no goal at all counts as a build).
+ifneq ($(if $(MAKECMDGOALS),$(filter-out clean distclean,$(MAKECMDGOALS)),all),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: all clean distclean
+# clean removes only the linked library files; compiled objects stay in $(BUILD_DIR).
+clean:
+	@rm -f $(TARGET) $(SHARED_LIBGOTCHA) $(STATIC_LIBGOTCHA) $(LIBGOTCHA).$(VERSION).$(RELEASE) $(LIBGOTCHA).$(VERSION)
+# distclean additionally deletes the per-compiler build directory; the library
+# files are already removed by its `clean` prerequisite, so they are not listed again.
+distclean: clean
+	@rm -rf $(BUILD_DIR)
diff --git a/ext/GOTCHA/include/CMakeLists.txt b/ext/GOTCHA/include/CMakeLists.txt
new file mode 100644
index 000000000..a319f277b
--- /dev/null
+++ b/ext/GOTCHA/include/CMakeLists.txt
@@ -0,0 +1 @@
+install(DIRECTORY gotcha DESTINATION include FILES_MATCHING PATTERN "*.h")
diff --git a/ext/GOTCHA/include/gotcha/Doxyfile b/ext/GOTCHA/include/gotcha/Doxyfile
new file mode 100644
index 000000000..728b2036b
--- /dev/null
+++ b/ext/GOTCHA/include/gotcha/Doxyfile
@@ -0,0 +1,1519 @@
+# Doxyfile 1.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           =
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses.
+# With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this tag.
+# The format is ext=language, where ext is a file extension, and language is one of
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
+# doxygen. The layout file controls the global structure of the generated output files
+# in an output format independent way. To create the layout file that represents
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name
+# of the layout file.
+
+LAYOUT_FILE            =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  =
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# If the HTML_TIMESTAMP tag is set to YES then the generated HTML
+# documentation will contain the timestamp.
+
+HTML_TIMESTAMP         = NO
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
+# are set, an additional index file will be generated that can be used as input for
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
+# HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          =
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
+# For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# When the SEARCHENGINE tag is enabled, doxygen will generate a search box
+# for the HTML output. The underlying search engine uses JavaScript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP)
+# there is already a search function so this one should typically
+# be disabled.
+
+SEARCHENGINE           = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will draw a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/ext/GOTCHA/include/gotcha/gotcha.h b/ext/GOTCHA/include/gotcha/gotcha.h
new file mode 100644
index 000000000..30018a238
--- /dev/null
+++ b/ext/GOTCHA/include/gotcha/gotcha.h
@@ -0,0 +1,125 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+/*!
+ ******************************************************************************
+ *
+ * \file gotcha.h
+ *
+ * \brief   Header file containing the external gotcha interface
+ *
+ *          The intended use pattern is as follows
+ *
+ *					TODO ON-INTERFACE-SOLID: document the interface 
+ *                                   usage
+ *
+ ******************************************************************************
+ */
+#ifndef GOTCHA_H
+#define GOTCHA_H
+
+#include "gotcha/gotcha_types.h"
+
+#if defined(__cplusplus) 
+extern "C" {
+#endif
+
+/*!
+ ******************************************************************************
+ * \def GOTCHA_MAKE_FUNCTION_PTR(name, ret_type, ...)
+ * \brief Declares a function pointer with a given name, return type, and
+ *        parameters
+ * \param name     The identifier of the function pointer being declared
+ * \param ret_type The return type of the function being pointed to
+ * \param ...      A comma separated list of the types of the parameters
+ *                 to the function being pointed to
+ *
+ * Expands to the declarator ret_type(*name)(...), so
+ * GOTCHA_MAKE_FUNCTION_PTR(fp, int, const char*) yields int(*fp)(const char*)
+ ******************************************************************************
+ */
+
+#define GOTCHA_MAKE_FUNCTION_PTR(name, ret_type, ...) ret_type(*name)(__VA_ARGS__)
+
+/*! Marks a declaration with default symbol visibility (GCC-style visibility
+ *  attribute); applied to the public entry points declared in this header. */
+#define GOTCHA_EXPORT __attribute__((__visibility__("default")))
+
+/*!
+ ******************************************************************************
+ *
+ * \fn enum gotcha_error_t gotcha_wrap(struct gotcha_binding_t* bindings,
+ *                                     int num_actions,
+ *                                     const char* tool_name);
+ *
+ * \brief Makes GOTCHA wrap the functions listed in the bindings table
+ *
+ * \param bindings    A list of bindings to wrap
+ * \param num_actions The number of items in the bindings table
+ * \param tool_name   A name you use to represent your tool when
+ *                    stacking multiple tools (currently unused).
+ *
+ * \return GOTCHA_SUCCESS on success, another gotcha_error_t value otherwise
+ *
+ ******************************************************************************
+ */
+
+GOTCHA_EXPORT enum gotcha_error_t gotcha_wrap(struct gotcha_binding_t* bindings, int num_actions, const char* tool_name);
+
+
+/*!
+ ******************************************************************************
+ *
+ * \fn enum gotcha_error_t gotcha_set_priority(const char *tool_name,
+ *                                             int priority);
+ *
+ * \brief Set the tool priority, which controls how multiple tools stack
+ *        wrappings over the same functions.
+ *
+ * \param tool_name   The tool name to set the priority of
+ * \param priority    The new priority value for the tool.  Lower values
+ *                    are called innermost.
+ *
+ * \return A gotcha_error_t status code
+ *
+ ******************************************************************************
+ */
+GOTCHA_EXPORT enum gotcha_error_t gotcha_set_priority(const char* tool_name, int priority);
+
+/*!
+ ******************************************************************************
+ *
+ * \fn enum gotcha_error_t gotcha_get_priority(const char *tool_name,
+ *                                             int *priority);
+ *
+ * \brief Gets the tool priority, which controls how multiple tools stack
+ *        wrappings over the same functions.
+ *
+ * \param tool_name   The tool name to get the priority of
+ * \param priority    Output parameter that receives the priority of the tool.
+ *
+ * \return A gotcha_error_t status code
+ *
+ ******************************************************************************
+ */
+GOTCHA_EXPORT enum gotcha_error_t gotcha_get_priority(const char* tool_name, int *priority);
+
+/*!
+ ******************************************************************************
+ *
+ * \fn void* gotcha_get_wrappee(gotcha_wrappee_handle_t handle)
+ *
+ * \brief Given a GOTCHA wrapper's handle, returns the wrapped function for it to call
+ *
+ * \param handle The wrappee handle to return the function pointer for
+ *
+ * \return The wrapped function as an untyped pointer; the caller converts it
+ *         to the matching function pointer type before calling it
+ *
+ ******************************************************************************
+ */
+GOTCHA_EXPORT void* gotcha_get_wrappee(gotcha_wrappee_handle_t handle);
+
+#if defined(__cplusplus) 
+}
+#endif
+
+#endif
diff --git a/ext/GOTCHA/include/gotcha/gotcha_types.h b/ext/GOTCHA/include/gotcha/gotcha_types.h
new file mode 100644
index 000000000..5e247f757
--- /dev/null
+++ b/ext/GOTCHA/include/gotcha/gotcha_types.h
@@ -0,0 +1,57 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*!
+ ******************************************************************************
+ *
+ * \file gotcha_types.h
+ *
+ * \brief   Header file containing the internal gotcha types
+ *
+ ******************************************************************************
+ */
+#ifndef GOTCHA_TYPES_H
+#define GOTCHA_TYPES_H
+
+#if defined(__cplusplus) 
+extern "C" {
+#endif
+
+/*!
+ * Opaque handle identifying a wrapped function. A binding stores the address
+ * of one of these; gotcha_get_wrappee() turns it into a callable pointer.
+ */
+typedef void* gotcha_wrappee_handle_t;
+
+/*!
+ * The representation of a Gotcha action
+ * as it passes through the pipeline.
+ * One entry describes one function to wrap; tables of these are passed
+ * to gotcha_wrap().
+ */
+typedef struct gotcha_binding_t {
+  const char* name;                                //!< The name of the function being wrapped
+  void* wrapper_pointer;                           //!< A pointer to the wrapper function
+  gotcha_wrappee_handle_t* function_handle;         //!< Address of the handle variable for the wrapped function (see gotcha_get_wrappee)
+}gotcha_binding_t;
+
+/*!
+ * The representation of an error (or success) of a Gotcha action.
+ * GOTCHA_SUCCESS is 0; every other enumerator denotes a failure.
+ */
+typedef enum gotcha_error_t {
+  GOTCHA_SUCCESS = 0,          //!< The call succeeded
+  GOTCHA_FUNCTION_NOT_FOUND,   //!< The call looked up a function which could not be found
+  GOTCHA_INTERNAL,             //!< Internal gotcha error
+  GOTCHA_INVALID_TOOL          //!< Invalid tool name
+}gotcha_error_t;
+
+#if defined(__cplusplus) 
+}
+#endif
+
+#endif
diff --git a/ext/GOTCHA/src/CMakeLists.txt b/ext/GOTCHA/src/CMakeLists.txt
new file mode 100644
index 000000000..1132a4ca7
--- /dev/null
+++ b/ext/GOTCHA/src/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Build script for the gotcha shared library.
+# NOTE(review): LIBTOOL_INTERFACE, LIBTOOL_REVISION, LIBTOOL_AGE,
+# DEFAULT_SYMBOL_VISIBILITY and GOTCHA_ENABLE_TESTS are not set in this file;
+# presumably the parent CMakeLists.txt defines them - confirm.
+
+# Provides CMAKE_INSTALL_LIBDIR used by the install() rules below.
+include(GNUInstallDirs)
+set(GOTCHA_SOURCES
+  gotcha_utils.c 
+  gotcha.c 
+  gotcha_auxv.c 
+  libc_wrappers.c 
+  elf_ops.c 
+  hash.c 
+  tool.c 
+  library_filters.c
+  gotcha_dl.c
+  translations.c
+)
+
+add_library(gotcha SHARED ${GOTCHA_SOURCES})
+
+
+# Shared-object versioning and symbol visibility for the main library.
+set_target_properties(gotcha PROPERTIES SOVERSION ${LIBTOOL_INTERFACE})
+set_target_properties(gotcha PROPERTIES VERSION "${LIBTOOL_INTERFACE}.${LIBTOOL_REVISION}.${LIBTOOL_AGE}")
+set_target_properties(gotcha PROPERTIES COMPILE_FLAGS "-fvisibility=${DEFAULT_SYMBOL_VISIBILITY}")
+set_target_properties(gotcha PROPERTIES LINK_FLAGS "-fvisibility=${DEFAULT_SYMBOL_VISIBILITY}")
+# Test-only variant: same sources and properties, additionally compiled with
+# the FORCE_NO_LIBC definition.
+if(GOTCHA_ENABLE_TESTS)
+add_library(gotcha_no_libc SHARED ${GOTCHA_SOURCES})
+set_target_properties(gotcha_no_libc PROPERTIES SOVERSION ${LIBTOOL_INTERFACE})
+set_target_properties(gotcha_no_libc PROPERTIES VERSION "${LIBTOOL_INTERFACE}.${LIBTOOL_REVISION}.${LIBTOOL_AGE}")
+set_target_properties(gotcha_no_libc PROPERTIES COMPILE_DEFINITIONS "FORCE_NO_LIBC")
+set_target_properties(gotcha_no_libc PROPERTIES COMPILE_FLAGS "-fvisibility=${DEFAULT_SYMBOL_VISIBILITY}")
+set_target_properties(gotcha_no_libc PROPERTIES LINK_FLAGS "-fvisibility=${DEFAULT_SYMBOL_VISIBILITY}")
+endif()
+
+# Lets the sources include the private headers that live next to them.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Install the library and export its target description for CMake consumers.
+install(TARGETS gotcha EXPORT gotcha-targets DESTINATION ${CMAKE_INSTALL_LIBDIR}) 
+install(EXPORT gotcha-targets DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/gotcha)
+
+add_subdirectory(example)
diff --git a/ext/GOTCHA/src/elf_ops.c b/ext/GOTCHA/src/elf_ops.c
new file mode 100644
index 000000000..93297e306
--- /dev/null
+++ b/ext/GOTCHA/src/elf_ops.c
@@ -0,0 +1,94 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "elf_ops.h"
+#include "libc_wrappers.h"
+#include <elf.h>
+/*!
+ * Fixed-size header found at the start of a GNU-style symbol hash table.
+ * lookup_gnu_hash_symbol() overlays it on the table pointer and expects the
+ * bloom filter words, the bucket array and the hash value array to follow it,
+ * in that order.
+ */
+struct gnu_hash_header {
+   uint32_t nbuckets;   //!< The number of buckets to hash symbols into
+   uint32_t symndx;     //!< Index of the first symbol accessible via hashtable in the symbol table
+   uint32_t maskwords;  //!< Number of words in the hash table's bloom filter
+   uint32_t shift2;     //!< The bloom filter's shift count
+};
+
+/*!
+ * Computes the GNU symbol hash of a NUL-terminated name: starting from 5381,
+ * every byte updates the value as hash * 33 + byte (modulo 2^32).
+ *
+ * Each byte is read as an unsigned char so that bytes >= 0x80 are not
+ * sign-extended on platforms where plain char is signed; otherwise the result
+ * would differ from the hash stored in the table for such names.
+ *
+ * \param str NUL-terminated symbol name
+ * \return The 32-bit hash value
+ */
+static uint32_t gnu_hash_func(const char *str) {
+  const unsigned char *cur = (const unsigned char *) str;
+  uint32_t hash = 5381;
+  for (; *cur != '\0'; cur++) {
+    hash = hash * 33 + *cur;
+  }
+  return hash;
+}
+
+/*!
+ * Looks up a symbol by name through a GNU-style hash table.
+ *
+ * The table starts with a gnu_hash_header, followed by maskwords bloom filter
+ * words (skipped here, the filter is not consulted), nbuckets bucket entries
+ * and finally one hash value per symbol starting at index symndx.
+ *
+ * \param name     NUL-terminated name of the symbol to find
+ * \param syms     The symbol table the hash table indexes
+ * \param symnames The string table holding the symbol names
+ * \param sheader  Pointer to the start of the GNU hash table
+ *
+ * \return The index of the symbol in syms, or -1 when it is not present
+ *         (including when the table is empty or the bucket entry is invalid)
+ */
+signed long lookup_gnu_hash_symbol(const char *name, ElfW(Sym) * syms,
+                                   char *symnames,
+                                   void *sheader) {
+  uint32_t *buckets, *vals;
+  uint32_t hash_val;
+  uint32_t cur_sym, cur_sym_hashval;
+  struct gnu_hash_header *header = (struct gnu_hash_header *) (sheader);
+
+  /* An empty table holds no symbols; return before the modulo below would
+     divide by zero. */
+  if (header->nbuckets == 0) {
+    return -1;
+  }
+
+  buckets = (uint32_t *)(((unsigned char *)(header + 1)) +
+                         (header->maskwords * sizeof(ElfW(Addr))));
+  vals = buckets + header->nbuckets;
+
+  hash_val = gnu_hash_func(name);
+  cur_sym = buckets[hash_val % header->nbuckets];
+  /* 0 marks an empty bucket. An index below symndx cannot be reached through
+     the table and would make the vals[] index below wrap around. */
+  if (cur_sym == 0 || cur_sym < header->symndx) {
+    return -1;
+  }
+
+  /* Bit 0 of a stored hash value flags the end of a chain, so it is masked
+     out of the comparison. */
+  hash_val &= ~1;
+  for (;;) {
+    cur_sym_hashval = vals[cur_sym - header->symndx];
+    if (((cur_sym_hashval & ~1) == hash_val) &&
+        (gotcha_strcmp(name, symnames + syms[cur_sym].st_name) == 0)) {
+      return (signed long)cur_sym;
+    }
+    if (cur_sym_hashval & 1) {
+      return -1;
+    }
+    cur_sym++;
+  }
+}
+
+/*!
+ * Computes the classic ELF symbol hash of a NUL-terminated name.
+ *
+ * Each byte is shifted into the accumulator four bits at a time; whenever the
+ * top nibble becomes non-zero it is folded back into the low bits and cleared.
+ *
+ * \param name NUL-terminated symbol name, as unsigned bytes
+ * \return The hash value
+ */
+static unsigned long elf_hash(const unsigned char *name) {
+  unsigned int acc = 0;
+  const unsigned char *cur;
+  for (cur = name; *cur != '\0'; cur++) {
+    unsigned int top;
+    acc = (acc << 4) + *cur;
+    top = acc & 0xf0000000;
+    if (top != 0) {
+      acc ^= top >> 24;
+    }
+    acc &= ~top;
+  }
+  return acc;
+}
+
+/*!
+ * Looks up a symbol by name through a classic ELF hash table.
+ *
+ * header[0] is the bucket count; the bucket array starts at header[2] and the
+ * chain array follows the buckets. A chain ends at STN_UNDEF.
+ *
+ * \param name     NUL-terminated name of the symbol to find
+ * \param syms     The symbol table the hash table indexes
+ * \param symnames The string table holding the symbol names
+ * \param header   Pointer to the start of the ELF hash table
+ *
+ * \return The index of the symbol in syms, or -1 when it is not present
+ *         (including when the table has no buckets)
+ */
+signed long lookup_elf_hash_symbol(const char *name, ElfW(Sym) * syms,
+                                   char *symnames, ElfW(Word) * header) {
+  ElfW(Word) *nbucket = header + 0;
+  ElfW(Word) *buckets = header + 2;
+  ElfW(Word) *chains = buckets + *nbucket;
+  unsigned int x;
+  signed long y;
+
+  /* A table without buckets holds no symbols; return before the modulo below
+     would divide by zero. */
+  if (*nbucket == 0) {
+    return -1;
+  }
+
+  x = elf_hash((const unsigned char *)name);
+  y = (signed long)buckets[x % *nbucket];
+  while (y != STN_UNDEF) {
+    if (gotcha_strcmp(name, symnames + syms[y].st_name) == 0) {
+      return y;
+    }
+    y = chains[y];
+  }
+
+  return -1;
+}
diff --git a/ext/GOTCHA/src/elf_ops.h b/ext/GOTCHA/src/elf_ops.h
new file mode 100644
index 000000000..061b35fac
--- /dev/null
+++ b/ext/GOTCHA/src/elf_ops.h
@@ -0,0 +1,206 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free Software
+Foundation) version 2.1 dated February 1999.  This program is distributed in the
+hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED
+WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms
+and conditions of the GNU Lesser General Public License for more details.  You should
+have received a copy of the GNU Lesser General Public License along with this
+program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#if !defined(ELF_OPS_H_)
+#define ELF_OPS_H_
+
+#include <elf.h>
+#include <link.h>
+#include "library_filters.h"
+
+
+/*!
+ ******************************************************************************
+ * \def GOTCHA_CHECK_VISIBILITY(sym)
+ * \brief Checks whether a given symbol is associated with a real function;
+ *        the test performed is that the symbol's st_size is greater than zero
+ * \param sym The symbol you wish to check (a symbol struct, not a pointer).
+ *            The argument is parenthesized, so expressions such as *ptr or
+ *            table[i] are accepted.
+ ******************************************************************************
+ */
+#define GOTCHA_CHECK_VISIBILITY(sym) ((sym).st_size > 0)
+
+/*!
+ ******************************************************************************
+ *
+ * \fn signed long lookup_gnu_hash_symbol(const char *name, 
+                                          ElfW(Sym) *syms,
+                                          char *symnames, 
+                                          void *sheader);
+ *
+ * \brief Looks up the index of a symbol in a symbol table given a symbol name 
+ *
+ * \param name     The name of the function to be looked up
+ * \param syms     The pointer to the symbol table
+ * \param symnames A pointer into the string table
+ * \param sheader  Pointer to the GNU hash table (its header) used for the lookup
+ *
+ * \return The index of the symbol in syms, or -1 if the name is not found
+ *
+ ******************************************************************************
+ */
+signed long lookup_gnu_hash_symbol(const char *name, ElfW(Sym) *syms, char *symnames, void *sheader);
+
+/*!
+ ******************************************************************************
+ *
+ * \fn signed long lookup_elf_hash_symbol(const char *name, 
+                                          ElfW(Sym) *syms,
+                                          char *symnames, 
+                                          ElfW(Word) *header);
+ *
+ * \brief Looks up the index of a symbol in a symbol table given a symbol name 
+ *
+ * \param name     The name of the function to be looked up
+ * \param syms     The pointer to the symbol table
+ * \param symnames A pointer into the string table
+ * \param header   Pointer to the ELF hash table used for the lookup
+ *
+ * \return The index of the symbol in syms, or -1 if the name is not found
+ *
+ ******************************************************************************
+ */
+signed long lookup_elf_hash_symbol(const char *name, ElfW(Sym) *syms, char *symnames, ElfW(Word) *header);
+
+/*!
+ ******************************************************************************
+ *
+ * \def INIT_DYNAMIC(lmap)
+ *
+ * \brief This macro declares a set of local variables in the enclosing scope
+ *        and initializes them from the dynamic section of a link.h
+ *        link_map entry
+ *
+ * The macro expands to declarations followed by statements, not to an
+ * expression. If the link map has no dynamic section it executes
+ * `return -1;`, so it may only be used inside a function whose return type
+ * accepts -1. Values are stored exactly as found in the dynamic entries.
+ * Every variable is finally cast to void, presumably to silence
+ * unused-variable warnings in users that need only some of them.
+ *
+ * \param lmap         The link map entry to read; it is used unparenthesized
+ *                     as `lmap->l_ld`, so pass a plain pointer expression
+ * \param[out] dynsec    The dynamic section associated with the link_map
+ * \param[out] dentry    Cursor used to walk the dynamic entries up to DT_NULL
+ * \param[out] rela      Declared and set to NULL only; filled in by the user
+ * \param[out] rel       Declared and set to NULL only; filled in by the user
+ * \param[out] jmprel    Value of the DT_JMPREL entry
+ * \param[out] symtab    Value of the DT_SYMTAB entry
+ * \param[out] gnu_hash  Value of the DT_GNU_HASH entry
+ * \param[out] elf_hash  Value of the DT_HASH entry
+ * \param[out] got       Value of the DT_PLTGOT entry
+ * \param[out] strtab    Value of the DT_STRTAB entry
+ * \param[out] rel_size  Value of the DT_PLTRELSZ entry
+ * \param[out] is_rela   Non-zero when DT_PLTREL equals DT_RELA
+ * \param[out] rel_count rel_size divided by the size of one Rela or Rel
+ *                       entry, chosen by is_rela
+ * \param[out] i         Uninitialized loop counter for use by the caller
+ *
+ ******************************************************************************
+ */
+#define INIT_DYNAMIC(lmap)                                      \
+   ElfW(Dyn) *dynsec = NULL, *dentry = NULL;                    \
+   ElfW(Rela) *rela = NULL;                                     \
+   ElfW(Rel) *rel = NULL;                                       \
+   ElfW(Addr) jmprel = 0;                                       \
+   ElfW(Sym) *symtab = NULL;                                    \
+   ElfW(Addr) gnu_hash = 0x0, elf_hash = 0x0;                   \
+   ElfW(Addr) got = 0x0;                                        \
+   char *strtab = NULL;                                         \
+   unsigned int rel_size = 0, rel_count = 0, is_rela = 0, i;    \
+   dynsec = lmap->l_ld;                                         \
+   if (!dynsec)                                                 \
+      return -1;                                                \
+   for (dentry = dynsec; dentry->d_tag != DT_NULL; dentry++) {  \
+      switch (dentry->d_tag) {                                  \
+         case DT_PLTRELSZ: {                                    \
+            rel_size = (unsigned int) dentry->d_un.d_val;       \
+            break;                                              \
+         }                                                      \
+         case DT_PLTGOT: {                                      \
+            got = dentry->d_un.d_ptr;                           \
+            break;                                              \
+         }                                                      \
+         case DT_HASH: {                                        \
+            elf_hash = dentry->d_un.d_val;                      \
+            break;                                              \
+         }                                                      \
+         case DT_STRTAB: {                                      \
+            strtab = (char *) dentry->d_un.d_ptr;               \
+            break;                                              \
+         }                                                      \
+         case DT_SYMTAB: {                                      \
+            symtab = (ElfW(Sym) *) dentry->d_un.d_ptr;          \
+            break;                                              \
+         }                                                      \
+         case DT_PLTREL: {                                      \
+            is_rela = (dentry->d_un.d_val == DT_RELA);          \
+            break;                                              \
+         }                                                      \
+         case DT_JMPREL: {                                      \
+            jmprel = dentry->d_un.d_val;                        \
+            break;                                              \
+         }                                                      \
+         case DT_GNU_HASH: {                                    \
+            gnu_hash = dentry->d_un.d_val;                      \
+            break;                                              \
+         }                                                      \
+      }                                                         \
+   }                                                            \
+   rel_count = rel_size / (is_rela ? sizeof(ElfW(Rela)) : sizeof(ElfW(Rel))); \
+   (void) rela;                                                 \
+   (void) rel;                                                  \
+   (void) jmprel;                                               \
+   (void) symtab;                                               \
+   (void) gnu_hash;                                             \
+   (void) elf_hash;                                             \
+   (void) got;                                                  \
+   (void) strtab;                                               \
+   (void) rel_size;                                             \
+   (void) rel_count;                                            \
+   (void) is_rela;                                              \
+   (void) i;
+
+/*!
+ ******************************************************************************
+ *
+ * \def FOR_EACH_PLTREL_INT(relptr, op, ...)
+ *
+ * \brief This macro calls an operation on each relocation in a dynamic section.
+ *        It should be called only by FOR_EACH_PLTREL, users should not call it
+ *
+ * The macro relies on the variables i, rel_count, symtab and strtab being in
+ * scope (INIT_DYNAMIC declares them) and on an R_SYM macro defined elsewhere.
+ * For every relocation it invokes
+ *    op(sym, symname, offset, <extra arguments>)
+ * where sym points to the relocation's symbol table entry, symname to that
+ * symbol's name in the string table and offset is the relocation's r_offset.
+ *
+ * \param relptr       The pointer to the first relocation in the section
+ * \param op           The operation to be performed on each relocation
+ * \param ...          Any additional arguments you wish to pass to op
+ *
+ ******************************************************************************
+ */
+#define FOR_EACH_PLTREL_INT(relptr, op, ...)                        \
+   for (i = 0; i < rel_count; i++) {                           \
+      ElfW(Addr) offset = relptr[i].r_offset;                  \
+      unsigned long symidx = R_SYM(relptr[i].r_info);          \
+      ElfW(Sym) *sym = symtab + symidx;                        \
+      char *symname = strtab + sym->st_name;                   \
+      op(sym, symname, offset, ## __VA_ARGS__);                                \
+   }
+
+/*!
+ ******************************************************************************
+ *
+ * \def FOR_EACH_PLTREL(lmap, op, ...)
+ *
+ * \brief This macro calls an operation on each relocation in the dynamic section
+ *        associated with a link map entry
+ *
+ * The macro expands to a braced block that runs INIT_DYNAMIC(lmap) and then
+ * iterates over the DT_JMPREL relocations, treating them as Rela or Rel
+ * entries depending on is_rela. Because INIT_DYNAMIC may execute
+ * `return -1;`, use this macro only inside a function whose return type
+ * accepts -1. The block-level `offset` (lmap->l_addr) is shadowed inside the
+ * loop, so op receives each relocation's r_offset.
+ *
+ * \param lmap         The link map whose relocations you want processed
+ * \param op           The operation to be performed on each relocation
+ * \param ...          Any additional arguments you wish to pass to op
+ *
+ ******************************************************************************
+ */
+#define FOR_EACH_PLTREL(lmap, op, ...) {                            \
+      INIT_DYNAMIC(lmap)                                       \
+      ElfW(Addr) offset = lmap->l_addr;                        \
+      (void) offset;                                           \
+      if (is_rela) {                                           \
+         rela = (ElfW(Rela) *) jmprel;                         \
+         FOR_EACH_PLTREL_INT(rela, op, ## __VA_ARGS__);                        \
+      }                                                        \
+      else {                                                   \
+         rel = (ElfW(Rel) *) jmprel;                           \
+         FOR_EACH_PLTREL_INT(rel, op, ## __VA_ARGS__);                         \
+      }                                                        \
+   }
+
+
+#endif
diff --git a/ext/GOTCHA/src/example/CMakeLists.txt b/ext/GOTCHA/src/example/CMakeLists.txt
new file mode 100644
index 000000000..4f1828870
--- /dev/null
+++ b/ext/GOTCHA/src/example/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Example programs; each subdirectory has its own CMakeLists.txt.
+add_subdirectory(autotee)
+add_subdirectory(minimal)
diff --git a/ext/GOTCHA/src/example/autotee/CMakeLists.txt b/ext/GOTCHA/src/example/autotee/CMakeLists.txt
new file mode 100644
index 000000000..dff914af8
--- /dev/null
+++ b/ext/GOTCHA/src/example/autotee/CMakeLists.txt
@@ -0,0 +1,6 @@
+# autotee: shared library built from autotee.c and linked against gotcha,
+# plus a test executable that links against that library.
+add_library(autotee SHARED autotee.c)
+add_executable(autotee_test test_autotee.c)
+target_link_libraries(autotee gotcha)
+target_link_libraries(autotee_test autotee)
+
+
diff --git a/ext/GOTCHA/src/example/autotee/autotee.c b/ext/GOTCHA/src/example/autotee/autotee.c
new file mode 100644
index 000000000..747f6fc81
--- /dev/null
+++ b/ext/GOTCHA/src/example/autotee/autotee.c
@@ -0,0 +1,211 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+/**
+ * autotee -
+ * Using gotcha, wrap the major IO writing routines with functions that 
+ * "tee" any stdout output to another file.  Init by calling
+ *    init_autotee(filename)
+ * finish by calling:
+ *    close_autotee()
+ *
+ * Note, this is a demonstration program for gotcha and does not handle
+ * cases like stdout's file descriptor being dup'd or more esoteric 
+ * IO routines.
+ **/
+
+
+#include "gotcha/gotcha_types.h"
+#include "gotcha/gotcha.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+/* Tee destination: tee_FILE is the stream opened by init_autotee() and tee_fd
+ * its file descriptor. NULL / -1 mean that teeing is not active. */
+static int tee_fd = -1;
+static FILE *tee_FILE = NULL;
+
+/* Wrapper functions installed in place of the corresponding libc routines */
+static int printf_wrapper(const char *format, ...);
+static int fprintf_wrapper(FILE *stream, const char *format, ...);
+static int vfprintf_wrapper(FILE *stream, const char *str, va_list args);
+static int vprintf_wrapper(const char *str, va_list args);
+static ssize_t write_wrapper(int fd, const void *buffer, size_t size);
+static int puts_wrapper(const char *str);
+static int fputs_wrapper(const char *str, FILE *f);
+static int fwrite_wrapper(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+
+/* One handle per wrapped routine; the wrappers pass these to
+ * gotcha_get_wrappee() to obtain the original function. */
+static gotcha_wrappee_handle_t orig_printf_handle;
+static gotcha_wrappee_handle_t orig_fprintf_handle;
+static gotcha_wrappee_handle_t orig_vfprintf_handle;
+static gotcha_wrappee_handle_t orig_vprintf_handle;
+static gotcha_wrappee_handle_t orig_write_handle;
+static gotcha_wrappee_handle_t orig_puts_handle;
+static gotcha_wrappee_handle_t orig_fputs_handle;
+static gotcha_wrappee_handle_t orig_fwrite_handle;
+
+
+/* Binding table handed to gotcha_wrap(): symbol name, replacement function and
+ * the address of the handle for the original. NUM_IOFUNCS must be kept equal
+ * to the number of entries in iofuncs. */
+#define NUM_IOFUNCS 8
+struct gotcha_binding_t iofuncs[] = {
+   { "printf", printf_wrapper, &orig_printf_handle },
+   { "fprintf", fprintf_wrapper, &orig_fprintf_handle },
+   { "vfprintf", vfprintf_wrapper, &orig_vfprintf_handle },
+   { "vprintf", vprintf_wrapper, &orig_vprintf_handle },
+   { "write", write_wrapper, &orig_write_handle },
+   { "puts", puts_wrapper, &orig_puts_handle },
+   { "fputs", fputs_wrapper, &orig_fputs_handle },
+   { "fwrite", fwrite_wrapper, &orig_fwrite_handle }
+};
+
+/**
+ * Opens the tee file and installs the gotcha wrappers for the stdout-writing
+ * routines listed in iofuncs.
+ *
+ * teefile: path of the file that is created/truncated ("w" mode) and receives
+ *          a copy of everything written to stdout through the wrapped calls.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or gotcha_wrap()
+ * reports an error. When gotcha_wrap() fails the tee file is closed again and
+ * the tee state is reset, so no FILE is leaked and any wrapper that was
+ * installed anyway simply passes its calls through.
+ **/
+int init_autotee(const char *teefile)
+{
+   enum gotcha_error_t result;
+   /* NOTE(review): the priority is set for "testing/whether/this/works" while
+    * gotcha_wrap() below registers as "testing/whether" - confirm that the
+    * differing tool names are intended. */
+   gotcha_set_priority("testing/whether/this/works", 1);
+   tee_FILE = fopen(teefile, "w");
+   if (!tee_FILE) {
+      perror("Failed to open tee file");
+      return -1;
+   }
+   tee_fd = fileno(tee_FILE);
+
+   result = gotcha_wrap(iofuncs, NUM_IOFUNCS, "testing/whether");
+   if (result != GOTCHA_SUCCESS) {
+      fprintf(stderr, "gotcha_wrap returned %d\n", (int) result);
+      /* Do not report failure while leaving the tee file open and armed */
+      fclose(tee_FILE);
+      tee_FILE = NULL;
+      tee_fd = -1;
+      return -1;
+   }
+
+   return 0;
+}
+
+/**
+ * Stops teeing and closes the tee file. Safe to call when teeing is not
+ * active. Always returns 0.
+ *
+ * The wrappers stay installed after this call; they decide whether to tee by
+ * testing tee_FILE / tee_fd, so both must be reset here. Leaving tee_FILE
+ * non-NULL would make every later stdout write go to a closed stream.
+ **/
+int close_autotee()
+{
+   if (tee_FILE) {
+      FILE *closing = tee_FILE;
+      /* Disarm the wrappers first, then close the stream */
+      tee_FILE = NULL;
+      tee_fd = -1;
+      fclose(closing);
+   }
+   return 0;
+}
+
+/* Replacement for printf(): when a tee file is set, formats the output into it
+ * first, then forwards the call to the original vprintf() and returns that
+ * call's result. */
+static int printf_wrapper(const char *format, ...)
+{
+   typeof(&vfprintf) real_vfprintf = gotcha_get_wrappee(orig_vfprintf_handle);
+   typeof(&vprintf) real_vprintf = gotcha_get_wrappee(orig_vprintf_handle);
+   va_list ap;
+   int rc;
+
+   va_start(ap, format);
+   if (tee_FILE != NULL) {
+      /* The arguments are consumed twice, so the tee gets its own copy */
+      va_list tee_ap;
+      va_copy(tee_ap, ap);
+      real_vfprintf(tee_FILE, format, tee_ap);
+      va_end(tee_ap);
+   }
+   rc = real_vprintf(format, ap);
+   va_end(ap);
+
+   return rc;
+}
+
+/* Replacement for fprintf(): output aimed at stdout is additionally formatted
+ * into the tee file when one is set; every call is then forwarded to the
+ * original vfprintf() on the requested stream, whose result is returned. */
+static int fprintf_wrapper(FILE *stream, const char *format, ...)
+{
+   typeof(&vfprintf) real_vfprintf = gotcha_get_wrappee(orig_vfprintf_handle);
+   va_list ap;
+   int rc;
+
+   va_start(ap, format);
+   if (stream == stdout && tee_FILE != NULL) {
+      /* The arguments are consumed twice, so the tee gets its own copy */
+      va_list tee_ap;
+      va_copy(tee_ap, ap);
+      real_vfprintf(tee_FILE, format, tee_ap);
+      va_end(tee_ap);
+   }
+   rc = real_vfprintf(stream, format, ap);
+   va_end(ap);
+
+   return rc;
+}
+
+/* Replacement for vfprintf(): output aimed at stdout is additionally formatted
+ * into the tee file when one is set; the call is then forwarded to the
+ * original vfprintf(), whose result is returned. */
+static int vfprintf_wrapper(FILE *stream, const char *str, va_list args)
+{
+   typeof(&vfprintf) real_vfprintf = gotcha_get_wrappee(orig_vfprintf_handle);
+
+   if (stream == stdout && tee_FILE != NULL) {
+      /* Format from a copy so that args is still intact for the real call */
+      va_list tee_args;
+      va_copy(tee_args, args);
+      real_vfprintf(tee_FILE, str, tee_args);
+      va_end(tee_args);
+   }
+   return real_vfprintf(stream, str, args);
+}
+
+/* Replacement for vprintf(): when a tee file is set, formats the output into
+ * it first, then forwards the call to the original vprintf() and returns that
+ * call's result. */
+static int vprintf_wrapper(const char *str, va_list args)
+{
+   typeof(&vfprintf) real_vfprintf = gotcha_get_wrappee(orig_vfprintf_handle);
+   typeof(&vprintf) real_vprintf = gotcha_get_wrappee(orig_vprintf_handle);
+
+   if (tee_FILE != NULL) {
+      /* Format from a copy so that args is still intact for the real call */
+      va_list tee_args;
+      va_copy(tee_args, args);
+      real_vfprintf(tee_FILE, str, tee_args);
+      va_end(tee_args);
+   }
+   return real_vprintf(str, args);
+}
+
+/* Replacement for write(): data written to file descriptor 1 is additionally
+ * written to the tee descriptor when one is set; the call is then forwarded
+ * to the original write(), whose result is returned. */
+static ssize_t write_wrapper(int fd, const void *buffer, size_t size)
+{
+   typeof(&write) real_write = gotcha_get_wrappee(orig_write_handle);
+
+   if (fd == 1 && tee_fd != -1) {
+      real_write(tee_fd, buffer, size);
+   }
+   return real_write(fd, buffer, size);
+}
+
+/* Replacement for puts(): when a tee file is set, writes the string and a
+ * newline to it with the original fputs(), then forwards the call to the
+ * original puts() and returns that call's result. */
+static int puts_wrapper(const char *str)
+{
+   typeof(&fputs) real_fputs = gotcha_get_wrappee(orig_fputs_handle);
+   typeof(&puts) real_puts = gotcha_get_wrappee(orig_puts_handle);
+
+   if (tee_FILE != NULL) {
+      real_fputs(str, tee_FILE);
+      real_fputs("\n", tee_FILE);
+   }
+   return real_puts(str);
+}
+
+/* Replacement for fputs(): strings aimed at stdout are additionally written
+ * to the tee file when one is set; the call is then forwarded to the original
+ * fputs(), whose result is returned. */
+static int fputs_wrapper(const char *str, FILE *f)
+{
+   typeof(&fputs) real_fputs = gotcha_get_wrappee(orig_fputs_handle);
+
+   if (f == stdout && tee_FILE != NULL) {
+      real_fputs(str, tee_FILE);
+   }
+   return real_fputs(str, f);
+}
+
+/* Replacement for fwrite(): data aimed at stdout is additionally written to
+ * the tee file when one is set; the call is then forwarded to the original
+ * fwrite(), whose result is returned. */
+static int fwrite_wrapper(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+   typeof(&fwrite) real_fwrite = gotcha_get_wrappee(orig_fwrite_handle);
+
+   if (stream == stdout && tee_FILE != NULL) {
+      real_fwrite(ptr, size, nmemb, tee_FILE);
+   }
+   return real_fwrite(ptr, size, nmemb, stream);
+}
diff --git a/ext/GOTCHA/src/example/autotee/test_autotee.c b/ext/GOTCHA/src/example/autotee/test_autotee.c
new file mode 100644
index 000000000..704ebb82c
--- /dev/null
+++ b/ext/GOTCHA/src/example/autotee/test_autotee.c
@@ -0,0 +1,47 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdio.h>
+#include <string.h>
+
+/* Entry points of the autotee example library (autotee.c). The file name is
+ * only read, so it is taken as const char * like the definition does; this
+ * also makes passing a string literal well-typed. */
+extern int init_autotee(const char *filename);
+extern int close_autotee(void);
+
+#define OUTPUT_FILE "tee.out"
+
+/* Exercises the autotee wrappers: prints one line before teeing starts, a
+ * series of stdout lines through different I/O routines while it is active,
+ * one stderr line, and one stdout line after teeing has been closed.
+ * Returns -1 if teeing cannot be set up, 0 otherwise. */
+int main()
+{
+   printf("Every stdout print after this line should also appear in %s:\n", OUTPUT_FILE);
+
+   if (init_autotee(OUTPUT_FILE) != 0) {
+      return -1;
+   }
+
+   /* printf family */
+   printf("First line\n");
+   printf("Second %s\n", "line");
+   fprintf(stdout, "Third line\n");
+   fprintf(stdout, "%s line\n", "Forth");
+
+   /* puts family and fwrite */
+   puts("Fifth line");
+   fputs("Sixth ", stdout);
+   fputs("line\n", stdout);
+   fwrite("Seventh line\n", 1, strlen("Seventh line\n"), stdout);
+
+   /* Output that must not reach the tee file */
+   fprintf(stderr, "Eighth line is stderr and should not appear in in %s\n", OUTPUT_FILE);
+   close_autotee();
+   printf("Ninth line is after close and should not appear in %s\n", OUTPUT_FILE);
+
+   return 0;
+}
diff --git a/ext/GOTCHA/src/example/minimal/CMakeLists.txt b/ext/GOTCHA/src/example/minimal/CMakeLists.txt
new file mode 100644
index 000000000..be620b3a9
--- /dev/null
+++ b/ext/GOTCHA/src/example/minimal/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_library(wrap_me SHARED sampleLib.c)
+add_executable(symb_look symbolLookup.c)
+target_link_libraries(wrap_me gotcha)
+target_link_libraries(symb_look wrap_me gotcha)
+
diff --git a/ext/GOTCHA/src/example/minimal/sampleLib.c b/ext/GOTCHA/src/example/minimal/sampleLib.c
new file mode 100644
index 000000000..03e8b98c2
--- /dev/null
+++ b/ext/GOTCHA/src/example/minimal/sampleLib.c
@@ -0,0 +1,63 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdio.h>
+#include "sampleLib.h"
+
+//We need a place to store the pointer to the function we've wrapped
+gotcha_wrappee_handle_t origRetX_handle;
+
+/**
+  * We need to express our desired wrapping behavior to
+  * GOTCHA. For that we need three things:
+  *
+  * 1) The name of a symbol to wrap
+  * 2) The function we want to wrap it with
+  * 3) Some place to store the original function, if we wish
+  *    to call it
+  *
+  * The bindings array below is filled out with a list of
+  * three-element structs containing those things.
+  *
+  * Note that the place to store the original function is passed
+  * by reference, this is required for us to be able to change it
+  */
+// Single entry: wrap the symbol "retX" with dogRetX and keep the handle to the
+// original function in origRetX_handle.
+struct gotcha_binding_t bindings[] = {{"retX", dogRetX, &origRetX_handle}};
+
+// This is like a tool library's initialization function
+int sample_init()
+{
+  // Hand every entry of the bindings table to GOTCHA under this tool's name.
+  // The result of gotcha_wrap is not inspected; the function always reports 0.
+  const int binding_count = (int) (sizeof(bindings) / sizeof(bindings[0]));
+
+  gotcha_wrap(bindings, binding_count, "gotcha_internal_sample_tool");
+  return 0;
+}
+
+/**
+  * In our example, this is the function we're wrapping.
+  * For convenience, it's in the same library, but this
+  * isn't a requirement imposed by GOTCHA
+  */
+int retX(int x)
+{
+  // Identity function: hands its argument straight back.
+  return x;
+}
+
+/** 
+  * This is our wrapper function. All GOTCHA wrappers *must*
+  * reference dogs somewhere in the code. I didn't write the
+  * rules (yes I did)
+  */
+int dogRetX(int x)
+{
+  // Resolve the wrapped function through the handle stored in the bindings table.
+  int (*wrapped)(int) = gotcha_get_wrappee(origRetX_handle);
+
+  printf("SO I FOR ONE THINK DOGS SHOULD RETURN %i\n", x);
+
+  // Without a wrappee there is nothing to forward to.
+  if (!wrapped) {
+    return 0;
+  }
+  // Forward the call and bump the result by one so the wrapping is observable.
+  return wrapped(x) + 1;
+}
diff --git a/ext/GOTCHA/src/example/minimal/sampleLib.h b/ext/GOTCHA/src/example/minimal/sampleLib.h
new file mode 100644
index 000000000..aa229af5c
--- /dev/null
+++ b/ext/GOTCHA/src/example/minimal/sampleLib.h
@@ -0,0 +1,23 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free Software
+Foundation) version 2.1 dated February 1999.  This program is distributed in the
+hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED
+WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms
+and conditions of the GNU Lesser General Public License for more details.  You should
+have received a copy of the GNU Lesser General Public License along with this
+program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#ifndef SAMPLE_LIB_H
+#define SAMPLE_LIB_H
+#include "gotcha/gotcha.h"
+int sample_init();
+int retX(int x);
+int dogRetX(int x);
+void* dog_malloc(int size);
+void* mylloc(int size);
+#endif
diff --git a/ext/GOTCHA/src/example/minimal/symbolLookup.c b/ext/GOTCHA/src/example/minimal/symbolLookup.c
new file mode 100644
index 000000000..ce8450dd6
--- /dev/null
+++ b/ext/GOTCHA/src/example/minimal/symbolLookup.c
@@ -0,0 +1,32 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <assert.h>
+#include <elf.h>
+#include <errno.h>
+#include <link.h>
+#include <stdio.h>
+#include <sys/mman.h>
+
+#include "sampleLib.h"
+
+// Intentionally empty and not called in this file.
+// NOTE(review): presumably a convenient debugger breakpoint anchor -- confirm.
+void dbg() {}
+// Installs the sample wrapper, then checks that a call to retX(9) yields 10,
+// i.e. that the call went through dogRetX (which adds one to the result).
+int main()
+{
+  int check_val;
+
+  sample_init();
+
+  check_val = retX(9);
+  assert(check_val == 10);
+
+  return 0;
+}
diff --git a/ext/GOTCHA/src/gotcha.c b/ext/GOTCHA/src/gotcha.c
new file mode 100644
index 000000000..8e9524d65
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha.c
@@ -0,0 +1,385 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "translations.h"
+#include "libc_wrappers.h"
+#include "gotcha/gotcha.h"
+#include "gotcha/gotcha_types.h"
+#include "gotcha_utils.h"
+#include "gotcha_auxv.h"
+#include "gotcha_dl.h"
+#include "elf_ops.h"
+#include "tool.h"
+
+/* Store `value` into the pointer-sized slot that `write` points at. */
+static void writeAddress(void* write, void* value){
+  void **slot = (void**)write;
+  *slot = value;
+}
+
+/* View the user binding's function_handle as a pointer to a pointer-sized slot. */
+static void** getBindingAddressPointer(struct gotcha_binding_t* in){
+  void **handle_slot = (void**)in->function_handle;
+  return handle_slot;
+}
+
+/* Overwrite the slot behind the user binding's function_handle with `value`. */
+static void setBindingAddressPointer(struct gotcha_binding_t* in, void* value){
+   void **handle_slot = getBindingAddressPointer(in);
+
+   debug_printf(3, "Updating binding address pointer at %p to %p\n", handle_slot, value);
+   writeAddress(handle_slot, value);
+}
+
+/*
+ * Follow `in` to the internal binding it refers to and return the address of
+ * that binding's wrappee_pointer field.
+ */
+static void** getInternalBindingAddressPointer(struct internal_binding_t** in){
+  struct internal_binding_t *internal = *in;
+  return (void**)&(internal->wrappee_pointer);
+}
+
+/*
+ * `in` is treated as a pointer to an internal binding pointer (callers pass a
+ * user binding's function_handle); the wrappee_pointer of that internal
+ * binding is set to `value`.
+ */
+static void setInternalBindingAddressPointer(void** in, void* value){
+  struct internal_binding_t **binding_ref = (struct internal_binding_t**)in;
+  void** wrappee_slot = getInternalBindingAddressPointer(binding_ref);
+
+  debug_printf(3, "Updating binding address pointer at %p to %p\n", wrappee_slot, value);
+  writeAddress(wrappee_slot, value);
+}
+
+/**
+ * Search the libraries on the dynamic linker's link map (_r_debug.r_map) for
+ * an exported definition of the symbol named by `binding`.  On the first hit,
+ * the symbol's run-time address (st_value + load address) is stored as the
+ * wrappee pointer reachable through the user binding's function_handle.
+ *
+ * Libraries are skipped when they are the vDSO, expose neither a GNU nor an
+ * ELF hash table, do not contain the symbol, or contain it without passing
+ * GOTCHA_CHECK_VISIBILITY.
+ *
+ * @param binding internal binding whose user_binding names the symbol
+ * @return 0 if the symbol was found, -1 if no loaded library exports it
+ */
+int prepare_symbol(struct internal_binding_t *binding)
+{
+   int result;
+   struct link_map *lib;
+   struct gotcha_binding_t *user_binding = binding->user_binding;
+
+   debug_printf(2, "Looking up exported symbols for %s\n", user_binding->name);
+   for (lib = _r_debug.r_map; lib != 0; lib = lib->l_next) {
+      /* Make sure GOTCHA has a bookkeeping object for every library it sees. */
+      struct library_t *int_library = get_library(lib);
+      if (!int_library) {
+         debug_printf(3, "Creating new library object for %s\n", LIB_NAME(lib));
+         int_library = add_library(lib);
+      }
+
+      if (is_vdso(lib)) {
+         debug_printf(2, "Skipping VDSO library at 0x%lx with name %s\n",
+                      lib->l_addr, LIB_NAME(lib));
+         continue;
+      }
+      debug_printf(2, "Searching for exported symbols in %s\n", LIB_NAME(lib));
+      /* NOTE(review): INIT_DYNAMIC is defined outside this file; it is relied on
+         here to declare gnu_hash, elf_hash, symtab and strtab for `lib`. */
+      INIT_DYNAMIC(lib);
+
+      if (!gnu_hash && !elf_hash) {
+         debug_printf(3, "Library %s does not export or import symbols\n", LIB_NAME(lib));
+         continue;
+      }
+      /* Try the GNU hash table first and fall back to the ELF hash table. */
+      result = -1;
+      if (gnu_hash) {
+         debug_printf(3, "Checking GNU hash for %s in %s\n",
+                      user_binding->name, LIB_NAME(lib));
+         result = lookup_gnu_hash_symbol(user_binding->name, symtab, strtab,
+                                         (struct gnu_hash_header *) gnu_hash);
+      }
+      if (elf_hash && result == -1) {
+         debug_printf(3, "Checking ELF hash for %s in %s\n",
+                      user_binding->name, LIB_NAME(lib));
+         result = lookup_elf_hash_symbol(user_binding->name, symtab, strtab,
+                                         (ElfW(Word) *)elf_hash);
+      }
+      if (result == -1) {
+         debug_printf(3, "%s not found in %s\n",
+                      user_binding->name, LIB_NAME(lib));
+         continue;
+      }
+      if (! GOTCHA_CHECK_VISIBILITY(symtab[result])) {
+         debug_printf(3, "Symbol %s found but not exported in %s\n",
+                      user_binding->name, LIB_NAME(lib));
+         continue;
+      }
+
+      debug_printf(2, "Symbol %s found in %s at 0x%lx\n",
+                   user_binding->name, LIB_NAME(lib),
+                   symtab[result].st_value + lib->l_addr);
+      setInternalBindingAddressPointer(user_binding->function_handle,(void *)(symtab[result].st_value + lib->l_addr));
+      return 0;
+   }
+   /* Every library was examined without a hit: report the failure truthfully. */
+   debug_printf(1, "Symbol %s was not found in program\n", user_binding->name);
+   return -1;
+}
+
+/*
+ * Make `binding` the first wrapper in the chain that currently starts at
+ * `head`: the old head becomes binding's successor, the wrappee pointer
+ * reachable through binding's function_handle is set to the old head's
+ * wrapper, and the function_hash_table entry for the symbol name is replaced
+ * so that it refers to `binding`.
+ */
+static void insert_at_head(struct internal_binding_t *binding, struct internal_binding_t *head)
+{
+   binding->next_binding = head;
+   /* NOTE(review): assumes *function_handle refers to `binding` itself; that
+      link is established outside this file -- confirm. */
+   setInternalBindingAddressPointer(binding->user_binding->function_handle, head->user_binding->wrapper_pointer);
+   /* Replace (remove + add) the table entry keyed by the symbol name. */
+   removefrom_hashtable(&function_hash_table, (void*) binding->user_binding->name);
+   addto_hashtable(&function_hash_table, (void*)binding->user_binding->name, (void*)binding);
+}
+
+/*
+ * Splice `binding` into the wrapper chain directly behind `pos`.
+ * The order of the first two statements matters: pos's current wrappee is
+ * handed to `binding` before pos is redirected to call binding's wrapper.
+ */
+static void insert_after_pos(struct internal_binding_t *binding, struct internal_binding_t *pos)
+{
+   /* binding now forwards to whatever pos used to forward to ... */
+   setInternalBindingAddressPointer(binding->user_binding->function_handle, pos->wrappee_pointer);
+   /* ... and pos forwards to binding's wrapper. */
+   setInternalBindingAddressPointer(pos->user_binding->function_handle, binding->user_binding->wrapper_pointer);
+   /* Standard singly-linked-list insertion after `pos`. */
+   binding->next_binding = pos->next_binding;
+   pos->next_binding = binding;
+}
+
+/* Result flags of rewrite_wrapper_orders(), interpreted by gotcha_wrap():
+ *   RWO_NEED_LOOKUP  - the wrappee still has to be resolved (prepare_symbol)
+ *   RWO_NEED_BINDING - the GOT entries have to be rewritten for this symbol
+ */
+#define RWO_NOCHANGE 0
+#define RWO_NEED_LOOKUP (1 << 0)
+#define RWO_NEED_BINDING (1 << 1)
+
+/**
+ * Place `binding` into the per-symbol chain of wrappers kept in
+ * function_hash_table, ordered by the priority of the owning tool.
+ *
+ * The first time a binding for "main" is seen, GOTCHA's own wrappers
+ * (libc_main_wrappers and main_wrappers) are registered via gotcha_wrap.
+ *
+ * @return RWO_NEED_LOOKUP | RWO_NEED_BINDING if the symbol had no entry yet,
+ *         RWO_NEED_BINDING if the binding became the new head of the chain,
+ *         RWO_NOCHANGE if it was inserted further down the chain or an
+ *         identical wrapper was encountered while walking the chain.
+ */
+static int rewrite_wrapper_orders(struct internal_binding_t* binding)
+{
+  const char* name = binding->user_binding->name;
+  int insert_priority = get_priority(binding->associated_binding_table->tool);
+
+  if(gotcha_strcmp(name,"main")==0){
+    if(!main_wrapped){
+      /* Fix: this message was the only one missing its trailing newline. */
+      debug_printf(2, "Wrapping main with Gotcha's internal wrappers\n");
+      /* Set the flag first: the nested gotcha_wrap calls come back through here. */
+      main_wrapped = 1;
+      gotcha_wrap(libc_main_wrappers,1,"gotcha");
+      gotcha_wrap(main_wrappers,1,"gotcha");
+    }
+  }
+
+  debug_printf(2, "gotcha_rewrite_wrapper_orders for binding %s in tool %s of priority %d\n",
+               name, binding->associated_binding_table->tool->tool_name, insert_priority);
+
+  /* No entry yet: this binding starts the chain for the symbol. */
+  struct internal_binding_t* head;
+  int hash_result;
+  hash_result = lookup_hashtable(&function_hash_table, (void*)name, (void**)&head);
+  if(hash_result != 0) {
+    debug_printf(2, "Adding new entry for %s to hash table\n", name);
+    addto_hashtable(&function_hash_table, (void *) name, (void *) binding);
+    return (RWO_NEED_LOOKUP | RWO_NEED_BINDING);
+  }
+
+  /* Higher priority than the current head: the binding becomes the new head. */
+  int head_priority = get_priority(head->associated_binding_table->tool);
+  if (head_priority < insert_priority) {
+     debug_printf(2, "New binding priority %d is greater than head priority %d, adding to head\n",
+                   insert_priority, head_priority);
+     insert_at_head(binding, head);
+     return RWO_NEED_BINDING;
+  }
+
+  /* Otherwise walk the chain until the next element has a lower priority. */
+  struct internal_binding_t* cur;
+  for (cur = head; cur->next_binding; cur = cur->next_binding) {
+     int next_priority = get_priority(cur->next_binding->associated_binding_table->tool);
+     debug_printf(3, "Comparing binding for new insertion %d to binding for tool %s at %d\n",
+                   insert_priority, cur->next_binding->associated_binding_table->tool->tool_name,
+                   next_priority);
+     if (next_priority < insert_priority) {
+        break;
+     }
+     /* The same wrapper function is already part of the chain. */
+     if (cur->user_binding->wrapper_pointer == binding->user_binding->wrapper_pointer) {
+        debug_printf(3, "Tool is already inserted.  Skipping binding rewrite\n");
+        return RWO_NOCHANGE;
+     }
+  }
+  debug_printf(2, "Inserting binding after tool %s\n", cur->associated_binding_table->tool->tool_name);
+  insert_after_pos(binding, cur);
+  return RWO_NOCHANGE;
+}
+
+/*
+ * Callback handed to FOR_EACH_PLTREL: if `name` has a binding in `lookuptable`,
+ * the slot at (load address of lmap + offset) is overwritten with that
+ * binding's wrapper.  Names without a binding are left alone.  Always returns 0.
+ */
+static int update_lib_bindings(ElfW(Sym) * symbol KNOWN_UNUSED, char *name, ElfW(Addr) offset,
+                               struct link_map *lmap, hash_table_t *lookuptable)
+{
+  struct internal_binding_t *match;
+  void **slot;
+
+  /* Nothing registered for this relocation's symbol. */
+  if (lookup_hashtable(lookuptable, name, (void **) &match) != 0) {
+     return 0;
+  }
+
+  slot = (void**) (lmap->l_addr + offset);
+  writeAddress(slot, match->user_binding->wrapper_pointer);
+  debug_printf(3, "Remapped call to %s at 0x%lx in %s to wrapper at 0x%p\n",
+             name, (lmap->l_addr + offset), LIB_NAME(lmap),
+             match->user_binding->wrapper_pointer);
+  return 0;
+}
+
+#ifndef MAX
+/* Larger of two values.  Both arguments are parenthesized so that expressions
+ * containing lower-precedence operators (e.g. `x & y`) are compared as a
+ * whole.  Each argument may still be evaluated twice, so avoid side effects. */
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+/*
+ * Ask the kernel (via gotcha_mprotect) to make the memory around the library's
+ * GOT readable, writable and executable so that entries can be overwritten.
+ * A failure is reported through error_printf only; the function always
+ * returns 0.
+ */
+static int mark_got_writable(struct link_map *lib)
+{
+   /* Page size is queried once and cached for later calls. */
+   static unsigned int page_size = 0;
+   /* NOTE(review): INIT_DYNAMIC is defined outside this file; it is relied on
+      here to declare `got` and `rel_size` for `lib`. */
+   INIT_DYNAMIC(lib);
+   if (!got)
+      return 0;
+
+   if (!page_size)
+      page_size = gotcha_getpagesize();
+
+   /* Protect at least one page, rounded up to a whole number of pages. */
+   size_t protect_size = MAX(rel_size, page_size);
+   if(protect_size % page_size){
+      protect_size += page_size -  ((protect_size) %page_size);
+   }
+   /* NOTE(review): BOUNDARY_BEFORE presumably rounds `got` down to a page
+      boundary -- confirm against its definition. */
+   ElfW(Addr) prot_address = BOUNDARY_BEFORE(got,(ElfW(Addr))page_size);
+   debug_printf(3, "Setting library %s GOT table from %p to +%lu to writeable\n",
+                LIB_NAME(lib), (void *) prot_address, protect_size);
+   int res = gotcha_mprotect((void*)prot_address,protect_size,PROT_READ | PROT_WRITE | PROT_EXEC );
+   if(res == -1){ // mprotect returns -1 on an error
+      error_printf("GOTCHA attempted to mark the GOT table as writable and was unable to do so, "
+                   "calls to wrapped functions may likely fail.\n");
+   }
+
+   return 0;
+}
+
+/*
+ * Rewrite the PLT relocation targets of one library so that calls to symbols
+ * present in `bindingtable` go to their wrappers.  Libraries rejected by
+ * libraryFilterFunc, or already stamped with the current generation, are
+ * skipped.  Always returns 0.
+ */
+static int update_library_got(struct link_map *map, hash_table_t *bindingtable)
+{
+   /* Create the bookkeeping object on first contact with this library. */
+   struct library_t *lib = get_library(map);
+   if (!lib) {
+      debug_printf(3, "Creating new library object for %s\n", LIB_NAME(map));
+      lib = add_library(map);
+   }
+
+   if (!libraryFilterFunc(map)) {
+      debug_printf(3, "Skipping library %s due to libraryFilterFunc\n", LIB_NAME(map));
+      return 0;
+   }
+
+   if (lib->generation == current_generation) {
+      debug_printf(2, "Library %s is already up-to-date.  Skipping GOT rewriting\n", LIB_NAME(map));
+      return 0;
+   }
+   
+   /* The flag is set even if the mprotect inside mark_got_writable failed,
+      so the attempt is made only once per library. */
+   if (!(lib->flags & LIB_GOT_MARKED_WRITEABLE)) {
+      mark_got_writable(map);
+      lib->flags |= LIB_GOT_MARKED_WRITEABLE;
+   }
+
+   /* Invoke update_lib_bindings for the library's PLT relocations; the
+      trailing arguments (map, bindingtable) are passed on to the callback. */
+   FOR_EACH_PLTREL(map, update_lib_bindings, map, bindingtable);
+
+   /* Remember that this library has been processed for this generation. */
+   lib->generation = current_generation;
+   return 0;
+}
+
+/*
+ * Run update_library_got() with `bindings` on every entry of the dynamic
+ * linker's link map (_r_debug.r_map).
+ */
+void update_all_library_gots(hash_table_t *bindings)
+{
+   struct link_map *current;
+
+   debug_printf(2, "Searching all callsites for %lu bindings\n", (unsigned long) bindings->entry_count);
+
+   current = _r_debug.r_map;
+   while (current != 0) {
+      update_library_got(current, bindings);
+      current = current->l_next;
+   }
+}
+
+/**
+ * Public entry point: register `num_actions` wrappers described by
+ * `user_bindings` on behalf of the tool named `tool_name`.
+ *
+ * Steps: reset each user handle to NULL, find or create the tool, advance
+ * current_generation, create the internal bindings, order each binding
+ * relative to already registered wrappers, resolve wrappees where needed and
+ * finally rewrite the GOT entries of all loaded libraries for the symbols
+ * that need (re)binding.
+ *
+ * @param user_bindings array of bindings (symbol name, wrapper, handle slot)
+ * @param num_actions   number of elements in user_bindings
+ * @param tool_name     tool identifier; NULL is replaced by "[UNSPECIFIED]"
+ * @return GOTCHA_SUCCESS, GOTCHA_FUNCTION_NOT_FOUND if at least one wrappee
+ *         could not be resolved (such bindings are kept in
+ *         notfound_binding_table), or GOTCHA_INTERNAL if the tool or its
+ *         bindings could not be created.
+ */
+GOTCHA_EXPORT enum gotcha_error_t gotcha_wrap(struct gotcha_binding_t* user_bindings, int num_actions, const char* tool_name)
+{
+  int i, not_found = 0, new_bindings_count = 0;
+  tool_t *tool;
+  hash_table_t new_bindings;
+
+  gotcha_init();
+
+  /* Fix: apply the default before tool_name is first used, so that a NULL
+     pointer is never handed to a "%s" conversion in the messages below. */
+  if (!tool_name)
+     tool_name = "[UNSPECIFIED]";
+
+  debug_printf(1, "User called gotcha_wrap for tool %s with %d bindings\n",
+               tool_name, num_actions);
+  if (debug_level >= 3) {
+    for (i = 0; i < num_actions; i++) {
+       debug_bare_printf(3, "\t%d: %s will map to %p\n", i, user_bindings[i].name,
+                         user_bindings[i].wrapper_pointer);
+    }
+  }
+  debug_printf(3, "Initializing %d user binding entries to NULL\n", num_actions);
+  for (i = 0; i < num_actions; i++) {
+    setBindingAddressPointer(&user_bindings[i], NULL);
+  }
+
+  tool = get_tool(tool_name);
+  if (!tool)
+     tool = create_tool(tool_name);
+  if (!tool) {
+     error_printf("Failed to create tool %s\n", tool_name);
+     return GOTCHA_INTERNAL;
+  }
+
+  /* A new generation makes update_library_got() revisit every library. */
+  current_generation++;
+  debug_printf(2, "Moved current_generation to %u in gotcha_wrap\n", current_generation);
+
+  debug_printf(2, "Creating internal binding data structures and adding binding to tool\n");
+  binding_t *bindings = add_binding_to_tool(tool, user_bindings, num_actions);
+  if (!bindings) {
+     error_printf("Failed to create bindings for tool %s\n", tool_name);
+     return GOTCHA_INTERNAL;
+  }
+
+  debug_printf(2, "Processing %d bindings\n", num_actions);
+  for (i = 0; i < num_actions; i++) {
+     struct internal_binding_t *binding = bindings->internal_bindings + i;
+
+     int result = rewrite_wrapper_orders(binding);
+     if (result & RWO_NEED_LOOKUP) {
+        debug_printf(2, "Symbol %s needs lookup operation\n", binding->user_binding->name);
+        int presult = prepare_symbol(binding);
+        if (presult == -1) {
+           /* Kept for a retry after a later dlopen (see per_binding in gotcha_dl.c). */
+           debug_printf(2, "Stashing %s in notfound_binding table to re-lookup on dlopens\n",
+                        binding->user_binding->name);
+           addto_hashtable(&notfound_binding_table, (hash_key_t) binding->user_binding->name, (hash_data_t) binding);
+           not_found++;
+        }
+     }
+     if (result & RWO_NEED_BINDING) {
+        debug_printf(2, "Symbol %s needs binding from application\n", binding->user_binding->name);
+        /* The temporary table is created lazily, on the first binding that needs it. */
+        if (!new_bindings_count) {
+           create_hashtable(&new_bindings, num_actions*2, (hash_func_t) strhash, (hash_cmp_t) gotcha_strcmp);
+        }
+        addto_hashtable(&new_bindings, (void *) binding->user_binding->name, (void *) binding);
+        new_bindings_count++;
+     }
+  }
+
+  if (new_bindings_count) {
+     update_all_library_gots(&new_bindings);
+     destroy_hashtable(&new_bindings);
+  }
+
+  if (not_found) {
+     debug_printf(1, "Could not find bindings for %d / %d functions\n", not_found, num_actions);
+     return GOTCHA_FUNCTION_NOT_FOUND;
+  }
+  debug_printf(1, "Gotcha wrap completed successfully\n");
+  return GOTCHA_SUCCESS;
+}
+
+/**
+ * Set an integer configuration value on the tool named `tool_name`, creating
+ * the tool if it does not exist yet.  GOTCHA_PRIORITY is the only key handled.
+ *
+ * @return GOTCHA_SUCCESS on success; GOTCHA_INTERNAL if the tool could not be
+ *         created or the key is not supported.
+ */
+static enum gotcha_error_t gotcha_configure_int(const char* tool_name, enum gotcha_config_key_t configuration_key , int value){
+  tool_t * tool = get_tool(tool_name);
+  if(tool==NULL){
+    tool = create_tool(tool_name);
+  }
+  /* Fix: create_tool can fail (gotcha_wrap checks for it as well); do not
+     dereference a NULL tool below. */
+  if(tool==NULL){
+    error_printf("Failed to create tool %s\n", tool_name);
+    return GOTCHA_INTERNAL;
+  }
+  if( configuration_key == GOTCHA_PRIORITY){
+    tool->config.priority = value;
+  }
+  else{
+    error_printf("Invalid property being configured on tool %s\n", tool_name);
+    return GOTCHA_INTERNAL;
+  }
+  return GOTCHA_SUCCESS;
+}
+
+/**
+ * Public entry point: set the priority of the tool named `tool_name` and move
+ * the tool to the matching position in the tool list.
+ *
+ * @return GOTCHA_SUCCESS, the error reported while storing the priority, or
+ *         GOTCHA_INTERNAL if the tool could not be obtained.
+ */
+GOTCHA_EXPORT enum gotcha_error_t gotcha_set_priority(const char* tool_name, int value){
+  gotcha_init();
+  debug_printf(1, "User called gotcha_set_priority(%s, %d)\n", tool_name, value);
+  enum gotcha_error_t error_on_set = gotcha_configure_int(tool_name, GOTCHA_PRIORITY, value);
+  if(error_on_set != GOTCHA_SUCCESS) {
+    return error_on_set;
+  }
+  tool_t* tool_to_place = get_tool(tool_name);
+  if(!tool_to_place){
+     tool_to_place = create_tool(tool_name);
+  }
+  /* Fix: never hand a NULL tool to the list manipulation helpers. */
+  if(!tool_to_place){
+     error_printf("Failed to create tool %s\n", tool_name);
+     return GOTCHA_INTERNAL;
+  }
+  /* Take the tool out of the list and re-insert it according to its new priority. */
+  remove_tool_from_list(tool_to_place);
+  reorder_tool(tool_to_place);
+  return GOTCHA_SUCCESS;
+}
+
+/*
+ * Public entry point: query the GOTCHA_PRIORITY setting of the tool named
+ * `tool_name` into *priority and pass on the status of the lookup.
+ */
+GOTCHA_EXPORT enum gotcha_error_t gotcha_get_priority(const char* tool_name, int *priority){
+  enum gotcha_error_t status;
+
+  gotcha_init();
+  status = get_configuration_value(tool_name, GOTCHA_PRIORITY, priority);
+  return status;
+}
+
+/**
+ * Public entry point: return the function that the wrapper owning `handle`
+ * should forward to (the wrappee_pointer of the internal binding).
+ *
+ * A NULL handle yields NULL instead of a crash: gotcha_wrap resets every user
+ * handle to NULL before it creates the internal bindings, so a wrapper can be
+ * left holding a NULL handle if that setup does not complete.
+ */
+GOTCHA_EXPORT void* gotcha_get_wrappee(gotcha_wrappee_handle_t handle){
+  struct internal_binding_t *binding = (struct internal_binding_t*)handle;
+  if (!binding) {
+    return NULL;
+  }
+  return binding->wrappee_pointer;
+}
diff --git a/ext/GOTCHA/src/gotcha_auxv.c b/ext/GOTCHA/src/gotcha_auxv.c
new file mode 100644
index 000000000..9f4c01e96
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_auxv.c
@@ -0,0 +1,294 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "gotcha_auxv.h"
+#include "gotcha_utils.h"
+#include "libc_wrappers.h"
+
+#include <elf.h>
+#include <link.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+static ElfW(Ehdr) *vdso_ehdr = NULL;
+static unsigned int auxv_pagesz = 0;
+
+
+/**
+ * Read /proc/self/auxv once and record the vDSO ELF header address
+ * (AT_SYSINFO_EHDR) in vdso_ehdr and the page size (AT_PAGESZ) in auxv_pagesz.
+ * The outcome is cached: later calls return immediately.
+ *
+ * At most 4096 bytes of the file are examined.  Only entries that were
+ * completely read are inspected, and the scan stops at the first AT_NULL.
+ *
+ * @return 0 on success (also on repeated calls after a success),
+ *         -1 if the file could not be opened or read.
+ */
+int parse_auxv_contents()
+{
+   char name[] = "/proc/self/auxv";
+   /* Typed storage gives the read buffer the alignment auxv_t requires. */
+   ElfW(auxv_t) entries[4096 / sizeof(ElfW(auxv_t))];
+   char *buffer = (char *) entries;
+   const ssize_t buffer_size = (ssize_t) sizeof(entries);
+   ssize_t offset = 0, result;
+   size_t num_entries, i;
+   int fd;
+   static int parsed_auxv = 0;
+
+   if (parsed_auxv)
+      return parsed_auxv == -1 ? parsed_auxv : 0;
+   parsed_auxv = 1;
+
+   fd = gotcha_open(name, O_RDONLY);
+   if (fd == -1) {
+      parsed_auxv = -1;
+      return -1;
+   }
+
+   /* Fill the buffer until end of file or until it is full. */
+   while (offset < buffer_size) {
+      result = gotcha_read(fd, buffer + offset, buffer_size - offset);
+      if (result == -1) {
+         if (errno == EINTR)
+            continue;
+         gotcha_close(fd);
+         parsed_auxv = -1;
+         return -1;
+      }
+      if (result == 0)
+         break;
+      offset += result;
+   }
+   gotcha_close(fd);
+
+   /* Fix: bound the scan by the data actually read so that an empty or
+      truncated file can never make us interpret uninitialized stack memory. */
+   num_entries = (size_t) offset / sizeof(ElfW(auxv_t));
+   for (i = 0; i < num_entries && entries[i].a_type != AT_NULL; i++) {
+      if (entries[i].a_type == AT_SYSINFO_EHDR) {
+         vdso_ehdr = (ElfW(Ehdr) *) entries[i].a_un.a_val;
+      }
+      else if (entries[i].a_type == AT_PAGESZ) {
+         auxv_pagesz = (int) entries[i].a_un.a_val;
+      }
+   }
+
+   return 0;
+}
+
+/*
+ * Locate the vDSO's link_map entry using the ELF header address taken from the
+ * auxiliary vector: the virtual address of the header's PT_DYNAMIC segment is
+ * compared, offset by each library's load address, with that library's l_ld.
+ * Returns NULL if no vDSO header is known or no entry matches.
+ */
+struct link_map *get_vdso_from_auxv()
+{
+   struct link_map *candidate;
+
+   ElfW(Phdr) *phdrs = NULL;
+   ElfW(Half) phdr_count, idx;
+   // Zero-initialized (change by Thomas Gruber) to avoid a compiler warning
+   ElfW(Addr) dynamic_vaddr = 0;
+
+   parse_auxv_contents();
+   if (vdso_ehdr == NULL)
+      return NULL;
+
+   // The program headers live e_phoff bytes behind the ELF header
+   phdrs = (ElfW(Phdr) *) (((unsigned char *) vdso_ehdr) + vdso_ehdr->e_phoff);
+   phdr_count = vdso_ehdr->e_phnum;
+
+   // Remember the address of the (last) PT_DYNAMIC program header
+   for (idx = 0; idx < phdr_count; idx++) {
+      if (phdrs[idx].p_type != PT_DYNAMIC)
+         continue;
+      dynamic_vaddr = (ElfW(Addr)) phdrs[idx].p_vaddr;
+   }
+
+   candidate = _r_debug.r_map;
+   while (candidate) {
+      if (candidate->l_addr + dynamic_vaddr == (ElfW(Addr)) candidate->l_ld) {
+         return candidate;
+      }
+      candidate = candidate->l_next;
+   }
+   return NULL;
+}
+
+/*
+ * Page size as recorded from the auxiliary vector, or 0 when the auxiliary
+ * vector could not be parsed.
+ */
+unsigned int get_auxv_pagesize()
+{
+   if (parse_auxv_contents() == -1) {
+      return 0;
+   }
+   return auxv_pagesz;
+}
+
+/* NULL-terminated list of library names under which the vDSO is looked up in
+ * the link map by get_vdso_from_aliases().
+ * NOTE(review): the lookup compares with gotcha_strcmp(...) == 0; if that is an
+ * exact comparison, a versioned name such as "linux-vdso.so.1" would not
+ * match these entries -- confirm. */
+static char* vdso_aliases[] = { "linux-vdso.so",
+                                "linux-gate.so",
+                                NULL };
+
+/*
+ * Return the first link_map entry whose l_name compares equal (via
+ * gotcha_strcmp) to one of the names in vdso_aliases, or NULL if there is none.
+ */
+struct link_map *get_vdso_from_aliases()
+{
+   struct link_map *entry;
+   int idx;
+
+   for (entry = _r_debug.r_map; entry; entry = entry->l_next) {
+      /* Entries without a name cannot match any alias. */
+      if (!entry->l_name)
+         continue;
+      for (idx = 0; vdso_aliases[idx]; idx++) {
+         if (gotcha_strcmp(entry->l_name, vdso_aliases[idx]) == 0)
+            return entry;
+      }
+   }
+   return NULL;
+}
+
+/**
+ * Read one line from `fd` into `line`, one byte at a time.
+ *
+ * The newline is kept and the result is always NUL-terminated.  A line that
+ * does not fit is cut off after size-1 bytes.
+ *
+ * @param line destination buffer of at least `size` bytes
+ * @param size capacity of `line`
+ * @param fd   descriptor to read from
+ * @return 0 if a (possibly truncated) line was read; -1 on end of file, on a
+ *         read error, or if `size` is not positive.  In the end-of-file and
+ *         error cases `line` holds the bytes read so far.
+ */
+static int read_line(char *line, int size, int fd)
+{
+   int i = 0;
+
+   /* Fix: without room for the terminator nothing can be stored safely. */
+   if (size <= 0)
+      return -1;
+
+   while (i < size - 1) {
+      int result = gotcha_read(fd, line + i, 1);
+      /* Fix: an interrupted read is retried at the SAME position; advancing
+         here would leave an unwritten byte in the middle of the line. */
+      if (result == -1 && errno == EINTR)
+         continue;
+      if (result == -1 || result == 0) {
+         line[i] = '\0';
+         return -1;
+      }
+      if (line[i] == '\n') {
+         line[i + 1] = '\0';
+         return 0;
+      }
+      i++;
+   }
+   line[size-1] = '\0';
+   return 0;
+}
+
+/*
+ * Parse the run of hexadecimal digits (0-9, a-f, A-F) at the start of `str`.
+ * The value is stored in *val (0 if there is no digit) and the number of
+ * digits consumed is returned.  No "0x" prefix is recognized and overflow is
+ * not detected.
+ */
+static int read_hex(char *str, unsigned long *val)
+{
+   unsigned long accum = 0, digits = 0;
+   char *cursor;
+
+   for (cursor = str; ; cursor++) {
+      const char c = *cursor;
+      unsigned long nibble;
+
+      if (c >= '0' && c <= '9')
+         nibble = (unsigned long) (c - '0');
+      else if (c >= 'a' && c <= 'f')
+         nibble = (unsigned long) (c - 'a' + 10);
+      else if (c >= 'A' && c <= 'F')
+         nibble = (unsigned long) (c - 'A' + 10);
+      else
+         break;   /* first non-hex character ends the number */
+
+      accum = (accum * 16) + nibble;
+      digits++;
+   }
+
+   *val = accum;
+   return digits;
+}
+
+/**
+ * Skip leading blanks (space, tab, newline) in `str` and copy the following
+ * whitespace-delimited word into `word`.
+ *
+ * @param str       NUL-terminated input
+ * @param word      destination buffer, or NULL to only skip over the word
+ * @param word_size capacity of `word`; the stored word is always
+ *                  NUL-terminated and cut to word_size-1 characters
+ * @return number of characters of `str` consumed (blanks plus word).  If the
+ *         word has to be cut because the buffer is full, the number of word
+ *         characters processed so far is returned instead.
+ */
+static int read_word(char *str, char *word, int word_size) 
+{
+   int word_cur = 0;
+   int len = 0;
+   while (*str == ' ' || *str == '\t' || *str == '\n') {
+      str++;
+      len++;
+   }
+   if (*str == '\0') {
+      /* Fix: callers pass word == NULL to skip a field; do not write through
+         a NULL pointer (or into a zero-sized buffer) when the input is
+         exhausted. */
+      if (word && word_size > 0)
+         *word = '\0';
+      return len;
+   }
+   while (*str != ' ' && *str != '\t' && *str != '\n' && *str != '\0') {
+      /* Destination full: terminate what has been stored and stop. */
+      if (word && word_cur >= word_size) {
+         if (word_size > 0 && word)
+            word[word_size-1] = '\0';
+         return word_cur;
+      }
+      if (word)
+         word[word_cur] = *str;
+      word_cur++;
+      str++;
+      len++;
+   }
+   /* The word filled the buffer exactly: give up its last character to make
+      room for the terminator. */
+   if (word_cur >= word_size)
+      word_cur--;
+   if (word)
+      word[word_cur] = '\0';
+   return len;
+}
+
+/**
+ * Locate the vDSO's link_map entry with the help of /proc/self/maps: find the
+ * address range of the mapping named "[vdso]" and return the link_map entry
+ * whose dynamic section (l_ld) lies inside that range.
+ *
+ * @return the matching link_map entry, or NULL if the file cannot be opened,
+ *         has no "[vdso]" line, or no entry falls into the range.
+ */
+struct link_map *get_vdso_from_maps()
+{
+   int maps, hit_eof;
+   /* Fix: read_hex() stores through an unsigned long *, so parse into
+      variables of exactly that type and convert afterwards. */
+   unsigned long range_begin = 0, range_end = 0;
+   ElfW(Addr) addr_begin, addr_end, dynamic;
+   char name[4096], line[4096], *line_pos;
+   struct link_map *m;
+
+   maps = gotcha_open("/proc/self/maps", O_RDONLY);
+   /* Fix: do not read from (and close) an invalid descriptor. */
+   if (maps == -1)
+      return NULL;
+
+   for (;;) {
+      /* Fix: pass the real capacity; the former literal 4097 allowed a write
+         one byte past the end of the 4096-byte line buffer. */
+      hit_eof = read_line(line, (int) sizeof(line), maps);
+      if (hit_eof) {
+         gotcha_close(maps);
+         return NULL;
+      }
+      /* Line layout: <begin>-<end> <4 fields that are skipped> <name> */
+      line_pos = line;
+      line_pos += read_hex(line_pos, &range_begin);
+      if (*line_pos != '-')
+         continue;
+      line_pos++;
+      line_pos += read_hex(line_pos, &range_end);
+      line_pos += read_word(line_pos, NULL, 0);
+      line_pos += read_word(line_pos, NULL, 0);
+      line_pos += read_word(line_pos, NULL, 0);
+      line_pos += read_word(line_pos, NULL, 0);
+      line_pos += read_word(line_pos, name, sizeof(name));
+      if (gotcha_strcmp(name, "[vdso]") == 0) {
+         gotcha_close(maps);
+         break;
+      }
+   }
+
+   addr_begin = (ElfW(Addr)) range_begin;
+   addr_end = (ElfW(Addr)) range_end;
+
+   for (m = _r_debug.r_map; m; m = m->l_next) {
+      dynamic = (ElfW(Addr)) m->l_ld;
+      if (dynamic >= addr_begin && dynamic < addr_end)
+         return m;
+   }
+
+   return NULL;
+}
+
+/*
+ * Tell whether `map` is the link_map entry of the vDSO.
+ *
+ * The vDSO entry is determined on the first call with a non-NULL `map`, trying
+ * the alias names, then the auxiliary vector, then /proc/self/maps; the
+ * outcome (including "not found") is cached for all later calls.
+ * Returns non-zero for the vDSO entry and 0 otherwise (also for NULL).
+ */
+int is_vdso(struct link_map *map)
+{
+   static int vdso_checked = 0;
+   static struct link_map *vdso = NULL;
+   struct link_map *found;
+
+   if (!map)
+      return 0;
+
+   if (!vdso_checked) {
+      vdso_checked = 1;
+
+      /* Try each detection strategy in turn until one succeeds. */
+      found = get_vdso_from_aliases();
+      if (!found)
+         found = get_vdso_from_auxv();
+      if (!found)
+         found = get_vdso_from_maps();
+      if (!found)
+         return 0;
+
+      vdso = found;
+   }
+
+   return (map == vdso);
+}
diff --git a/ext/GOTCHA/src/gotcha_auxv.h b/ext/GOTCHA/src/gotcha_auxv.h
new file mode 100644
index 000000000..576a018b2
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_auxv.h
@@ -0,0 +1,41 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef GOTCHA_AUXV_H
+#define GOTCHA_AUXV_H
+
+#include <elf.h>
+#include <link.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+int is_vdso(struct link_map *map);
+unsigned int get_auxv_pagesize();
+
+//Do not use, exposed only for unit testing
+int parse_auxv_contents();
+struct link_map *get_vdso_from_auxv();
+struct link_map *get_vdso_from_aliases();
+struct link_map *get_vdso_from_maps();
+
+
+
+#endif
diff --git a/ext/GOTCHA/src/gotcha_dl.c b/ext/GOTCHA/src/gotcha_dl.c
new file mode 100644
index 000000000..7351a96bf
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_dl.c
@@ -0,0 +1,75 @@
+#define _GNU_SOURCE
+#include "gotcha_dl.h"
+#include "tool.h"
+#include "libc_wrappers.h"
+#include "elf_ops.h"
+#include <dlfcn.h>
+
+void* _dl_sym(void* handle, const char* name, void* where);
+
+gotcha_wrappee_handle_t orig_dlopen_handle;
+gotcha_wrappee_handle_t orig_dlsym_handle;
+
+/*
+ * Callback run by dlopen_wrapper for each entry of notfound_binding_table:
+ * follow the chain to its last (innermost) binding and retry the symbol
+ * lookup for it.  On success the entry is removed from the table.
+ * Always returns 0.
+ */
+static int per_binding(hash_key_t key, hash_data_t data, void *opaque KNOWN_UNUSED)
+{
+   struct internal_binding_t *innermost = (struct internal_binding_t *) data;
+
+   debug_printf(3, "Trying to re-bind %s from tool %s after dlopen\n",
+                innermost->user_binding->name, innermost->associated_binding_table->tool->tool_name);
+
+   /* Walk to the end of the wrapper chain. */
+   for (; innermost->next_binding; ) {
+      innermost = innermost->next_binding;
+      debug_printf(3, "Selecting new innermost version of binding %s from tool %s.\n",
+                   innermost->user_binding->name, innermost->associated_binding_table->tool->tool_name);
+   }
+
+   if (prepare_symbol(innermost) == -1) {
+      debug_printf(3, "Still could not prepare binding %s after dlopen\n", innermost->user_binding->name);
+      return 0;
+   }
+
+   /* Resolved now: no need to retry on future dlopens. */
+   removefrom_hashtable(&notfound_binding_table, key);
+   return 0;
+}
+
+/*
+ * Wrapper installed for dlopen: performs the original dlopen, retries the
+ * lookup of all bindings whose symbol was not found so far, and rewrites the
+ * GOT entries of the loaded libraries.  Returns what the original dlopen
+ * returned.
+ */
+static void* dlopen_wrapper(const char* filename, int flags) {
+   void* (*real_dlopen)(const char*, int) = gotcha_get_wrappee(orig_dlopen_handle);
+   void *lib_handle;
+
+   debug_printf(1, "User called dlopen(%s, 0x%x)\n", filename, (unsigned int) flags);
+   lib_handle = real_dlopen(filename, flags);
+
+   debug_printf(2, "Searching new dlopened libraries for previously-not-found exports\n");
+   foreach_hash_entry(&notfound_binding_table, NULL, per_binding);
+
+   debug_printf(2, "Updating GOT entries for new dlopened libraries\n");
+   update_all_library_gots(&function_hash_table);
+
+   return lib_handle;
+}
+
+/*
+ * Wrapper installed for dlsym.  RTLD_NEXT lookups are passed to _dl_sym with
+ * this wrapper's return address.  Otherwise, a symbol that has an entry in
+ * function_hash_table resolves to that entry's wrapper; every other symbol is
+ * resolved by the original dlsym.
+ */
+static void* dlsym_wrapper(void* handle, const char* symbol_name){
+  void* (*real_dlsym)(void*, const char*) = gotcha_get_wrappee(orig_dlsym_handle);
+  struct internal_binding_t *wrapped;
+  int lookup_status;
+
+  if(handle == RTLD_NEXT){
+    return _dl_sym(RTLD_NEXT, symbol_name ,__builtin_return_address(0));
+  }
+
+  lookup_status = lookup_hashtable(&function_hash_table, (hash_key_t) symbol_name, (hash_data_t *) &wrapped);
+  if (lookup_status != -1) {
+     /* The symbol is wrapped: hand out the wrapper instead of the original. */
+     return wrapped->user_binding->wrapper_pointer;
+  }
+  return real_dlsym(handle, symbol_name);
+}
+
+/* Bindings that route dlopen and dlsym through the wrappers above; the
+ * handles of the original functions are kept in orig_dlopen_handle and
+ * orig_dlsym_handle.  Registered by handle_libdl(). */
+struct gotcha_binding_t dl_binds[] = {
+  {"dlopen", dlopen_wrapper, &orig_dlopen_handle},
+  {"dlsym", dlsym_wrapper, &orig_dlsym_handle}
+};     
+/* Register every entry of dl_binds (dlopen and dlsym) under the tool name "gotcha". */
+void handle_libdl(){
+  const int bind_count = (int) (sizeof(dl_binds) / sizeof(dl_binds[0]));
+
+  gotcha_wrap(dl_binds, bind_count, "gotcha");
+}
+
diff --git a/ext/GOTCHA/src/gotcha_dl.h b/ext/GOTCHA/src/gotcha_dl.h
new file mode 100644
index 000000000..a1117f448
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_dl.h
@@ -0,0 +1,15 @@
+#ifndef GOTCHA_DL_H
+#define GOTCHA_DL_H
+
+#include "hash.h"
+#include "tool.h"
+
+void handle_libdl();
+extern void update_all_library_gots(hash_table_t *bindings);
+extern int prepare_symbol(struct internal_binding_t *binding);
+
+extern gotcha_wrappee_handle_t orig_dlopen_handle;
+extern gotcha_wrappee_handle_t orig_dlsym_handle;
+
+extern struct gotcha_binding_t dl_binds[];
+#endif
diff --git a/ext/GOTCHA/src/gotcha_utils.c b/ext/GOTCHA/src/gotcha_utils.c
new file mode 100644
index 000000000..1f43f38fb
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_utils.c
@@ -0,0 +1,124 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "gotcha_utils.h"
+#include "gotcha_dl.h"
+#include "tool.h"
+#include "libc_wrappers.h"
+#include "elf_ops.h"
+#include "gotcha/gotcha.h"
+#include <stdlib.h>
+#include "hash.h"
+
+int debug_level;
+/* One-shot setup of debug_level from the environment variable named by GOTCHA_DEBUG_ENV. */
+static void debug_init()
+{
+   static int already_run = 0;
+   char *env_value;
+
+   if (already_run)
+      return;
+   already_run = 1;
+
+   env_value = gotcha_getenv(GOTCHA_DEBUG_ENV);
+   if (env_value == NULL)
+      return;
+
+   /* A set variable always enables output: values that parse to <= 0 are raised to level 1. */
+   debug_level = gotcha_atoi(env_value);
+   if (debug_level < 1)
+      debug_level = 1;
+
+   debug_printf(0, "Gotcha debug initialized at level %d\n", debug_level);
+}
+
+hash_table_t function_hash_table;
+hash_table_t notfound_binding_table;
+
+static hash_table_t library_table;
+static library_t *library_list = NULL;
+unsigned int current_generation;
+
+/* Hashes a link_map by XOR-ing its (truncated) address with the string hash of LIB_NAME(map). */
+static hash_hashvalue_t link_map_hash(struct link_map *map)
+{
+   const hash_hashvalue_t addr_bits = (hash_hashvalue_t) ((unsigned long) map);
+   return addr_bits ^ strhash(LIB_NAME(map));
+}
+
+/* Key comparator for library_table. lookup() treats 0 as "same key", so return 0 only for identical maps (was: a < b). */
+static int link_map_cmp(struct link_map *a, struct link_map *b) {
+   return (((unsigned long) a) > ((unsigned long) b)) - (((unsigned long) a) < ((unsigned long) b));
+}
+
+static void setup_hash_tables() {   /* creates the three global tables; create_hashtable() return values are not checked */
+   create_hashtable(&library_table, 128, (hash_func_t) link_map_hash, (hash_cmp_t) link_map_cmp);   /* keyed by struct link_map pointer */
+   create_hashtable(&function_hash_table, 4096, (hash_func_t) strhash, (hash_cmp_t) gotcha_strcmp);   /* keyed by name string (dlsym_wrapper looks symbols up here) */
+   create_hashtable(&notfound_binding_table, 128, (hash_func_t) strhash, (hash_cmp_t) gotcha_strcmp);   /* string-keyed; walked by dlopen_wrapper after each dlopen */
+}
+
+/* Returns the library_t stored for map in library_table, or NULL when map has no entry. */
+struct library_t *get_library(struct link_map *map)
+{
+   library_t *found;
+
+   if (lookup_hashtable(&library_table, (hash_key_t) map, (hash_data_t *) &found) == -1)
+      return NULL;
+   return found;
+}
+
+/* Creates a record for map, pushes it on the front of library_list and indexes it in library_table; NULL if out of memory. */
+struct library_t *add_library(struct link_map *map)
+{
+   library_t *newlib = gotcha_malloc(sizeof(library_t));
+   if (!newlib) return NULL;   /* previously dereferenced unconditionally */
+   newlib->map = map;
+   newlib->flags = 0; newlib->generation = 0;
+   newlib->next = library_list;
+   newlib->prev = NULL;
+   if (library_list) library_list->prev = newlib;
+   library_list = newlib;
+   addto_hashtable(&library_table, (hash_key_t) map, (hash_data_t) newlib);
+   return newlib;
+}
+
+/* Unlinks the record for map from library_list and library_table, zeroes it and frees it; unknown maps are ignored. */
+void remove_library(struct link_map *map)
+{
+   library_t *victim = get_library(map);
+   if (victim == NULL) return;
+   if (victim->prev != NULL)
+      victim->prev->next = victim->next;
+   if (victim->next != NULL)
+      victim->next->prev = victim->prev;
+   if (library_list == victim)
+      library_list = victim->next;
+   removefrom_hashtable(&library_table, (hash_key_t) map);
+   memset(victim, 0, sizeof(library_t));
+   gotcha_free(victim);
+}
+
+/* Run-once bring-up: debug settings first, then the hash tables, then the dlopen/dlsym wrappers. */
+void gotcha_init(){
+   static int init_done = 0;
+   if (!init_done) {
+      init_done = 1;
+      debug_init();
+      setup_hash_tables();
+      handle_libdl();
+   }
+}
+
diff --git a/ext/GOTCHA/src/gotcha_utils.h b/ext/GOTCHA/src/gotcha_utils.h
new file mode 100644
index 000000000..85aada986
--- /dev/null
+++ b/ext/GOTCHA/src/gotcha_utils.h
@@ -0,0 +1,103 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free Software
+Foundation) version 2.1 dated February 1999.  This program is distributed in the
+hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED
+WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms
+and conditions of the GNU Lesser General Public License for more details.  You should
+have received a copy of the GNU Lesser General Public License along with this
+program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+/*!
+ ******************************************************************************
+ *
+ * \file gotcha_utils.h
+ *
+ * \brief   Header file containing the internal gotcha mechanisms
+ *          for manipulating the running process to redirect calls
+ *
+ ******************************************************************************
+ */
+#ifndef GOTCHA_UTILS_H
+#define GOTCHA_UTILS_H
+#include <sys/mman.h>
+#include "gotcha/gotcha_types.h"
+#include "hash.h"
+// TODO: remove these includes
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+// END TODO
+#include <elf.h>
+#include <link.h>
+#include <string.h>
+
+#define KNOWN_UNUSED __attribute__((unused))
+
+#define GOTCHA_DEBUG_ENV "GOTCHA_DEBUG"
+extern int debug_level;
+void gotcha_init();
+extern hash_table_t function_hash_table;
+extern hash_table_t notfound_binding_table;
+#define debug_bare_printf(lvl, format, ...)       \
+   do { /* prints via gotcha_dbg_printf only when debug_level >= lvl; adds no prefix */ \
+     if (debug_level >= lvl) {                    \
+        gotcha_dbg_printf(format, ## __VA_ARGS__); \
+     }                                            \
+   } while (0);
+/* __FILE__ without its directory part: the text after the last '/', or all of __FILE__ when it has none (uses GNU "?:"). */
+#define SHORT_FILE__ ((strrchr(__FILE__, '/') ? : __FILE__ - 1) + 1)
+/* Like debug_bare_printf but prefixed with "[tid/pid][file:line] - ". NOTE(review): the expansion already ends in ';', so it cannot be the sole statement of an if that has an else. */
+#define debug_printf(lvl, format, ...)               \
+   do {                                              \
+     if (debug_level >= lvl) {                       \
+        gotcha_dbg_printf("[%d/%d][%s:%u] - " format, \
+               gotcha_gettid(), gotcha_getpid(),     \
+               SHORT_FILE__, __LINE__,               \
+               ## __VA_ARGS__);                      \
+     }                                               \
+   } while (0);
+/* Same prefix with a leading "ERROR "; emitted whenever debug_level is non-zero (there is no level argument). */
+#define error_printf(format, ...)                          \
+do {                                                       \
+     if (debug_level) {                                    \
+        gotcha_dbg_printf("ERROR [%d/%d][%s:%u] - " format, \
+               gotcha_gettid(), gotcha_getpid(),           \
+               SHORT_FILE__, __LINE__,                     \
+               ## __VA_ARGS__);                            \
+     }                                                     \
+   } while (0);
+
+#define LIB_NAME(X) (!(X)->l_name ? "[NULL]" : (!*(X)->l_name ? "[EMPTY]" : (X)->l_name)) /* printable name of a link_map: placeholder for a NULL or empty l_name; X is parenthesised so any pointer expression is accepted */
+
+/*!
+ ******************************************************************************
+ * \def R_SYM(X)
+ * \brief Returns an ELF symbol which is correct for the current architecture
+ * \param X The value you wish to cast to Symbol type
+ ******************************************************************************
+ */
+#if __WORDSIZE == 64
+#define R_SYM(X) ELF64_R_SYM(X)
+#else
+#define R_SYM(X) ELF32_R_SYM(X)
+#endif
+
+/*!
+ ******************************************************************************
+ * \def BOUNDARY_BEFORE(ptr,pagesize)
+ * \brief Returns the address on page boundary before the given pointer
+ * \param ptr The address you wish to get the page boundary before
+ * \param pagesize Page size to align to (power of two); widened to ElfW(Addr) before masking so an unsigned int cannot clear the upper address bits
+ ******************************************************************************
+ */
+#define BOUNDARY_BEFORE(ptr, pagesize) \
+  ((ElfW(Addr))(ptr) & ~((ElfW(Addr))(pagesize) - 1))
+
+
+#endif
diff --git a/ext/GOTCHA/src/hash.c b/ext/GOTCHA/src/hash.c
new file mode 100644
index 000000000..8a631310b
--- /dev/null
+++ b/ext/GOTCHA/src/hash.c
@@ -0,0 +1,244 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "libc_wrappers.h"
+#include "hash.h"
+
+#define EMPTY 0      /* slot never used; ends a probe sequence in lookup() */
+#define TOMBSTONE 1  /* slot whose entry was removed; probing continues past it, insert() may reuse it */
+#define INUSE 2      /* slot holds a live entry */
+
+struct hash_entry_t {
+   hash_key_t key;
+   hash_data_t data;
+   hash_hashvalue_t hash_value;   /* cached hash of key; compared before keycmp is called */
+   struct hash_entry_t *next;     /* links of the iteration list that starts at table->head */
+   struct hash_entry_t *prev;
+   uint32_t status;               /* EMPTY, TOMBSTONE or INUSE */
+};
+
+typedef struct hash_entry_t hash_entry_t;
+
+/* Initialises *table with zeroed storage for at least initial_size entries, rounded up to whole pages.
+ * A request for 0 entries gets one page worth, so later "hash % table_size" can never divide by zero.
+ * Returns 0 on success, -1 when the allocation fails (*table is then left untouched). */
+int create_hashtable(hash_table_t *table, size_t initial_size, hash_func_t hashfunc,
+                     hash_cmp_t keycmp)
+{
+   hash_entry_t *newtable;
+   size_t entries_per_page = gotcha_getpagesize() / sizeof(hash_entry_t);
+   if (initial_size == 0)
+      initial_size = entries_per_page;
+   else if (initial_size % entries_per_page)
+      initial_size += entries_per_page - (initial_size % entries_per_page);
+   newtable = (hash_entry_t *) gotcha_malloc(initial_size * sizeof(hash_entry_t));
+   if (!newtable)
+      return -1;
+   gotcha_memset(newtable, 0, initial_size * sizeof(hash_entry_t));
+   table->table_size = initial_size;
+   table->entry_count = 0;
+   table->hashfunc = hashfunc;
+   table->keycmp = keycmp;
+   table->table = newtable;
+   table->head = NULL;
+   return 0;
+}
+
+/* Linear-probing insert: claims the first EMPTY or TOMBSTONE slot at or after value % table_size and
+ * links it at the head of the iteration list. Returns the slot, or NULL when every slot is INUSE
+ * (the original kept the last probed, still occupied slot in that case and relinked it). */
+static hash_entry_t *insert(hash_table_t *table, hash_key_t key, hash_data_t data, hash_hashvalue_t value)
+{
+   unsigned long index = (unsigned long)value % table->table_size;
+   unsigned long startindex = index;
+   hash_entry_t *entry = NULL;
+   do {
+      hash_entry_t *candidate = table->table + index;
+      if (candidate->status == EMPTY || candidate->status == TOMBSTONE) {
+         entry = candidate;
+         break;
+      }
+      if (++index == table->table_size)
+         index = 0;
+   } while (index != startindex);
+   if (!entry)
+      return NULL;
+
+   entry->key = key;
+   entry->data = data;
+   entry->hash_value = value;
+   entry->status = INUSE;
+   entry->next = table->head;
+   entry->prev = NULL;
+   if (table->head)
+      table->head->prev = entry;
+   table->head = entry;
+   table->entry_count++;
+   return entry;
+}
+
+/* Rehashes every INUSE entry of *table into a newly allocated array of new_size slots and swaps it in.
+ * Returns 0 on success; on allocation or insert failure returns -1, frees the new array and leaves *table as it was. */
+int grow_hashtable(hash_table_t *table, size_t new_size)
+{
+   hash_table_t newtable;
+   size_t i;
+   newtable.table_size = new_size;
+   newtable.entry_count = 0;
+   newtable.hashfunc = table->hashfunc;
+   newtable.keycmp = table->keycmp;
+   newtable.head = NULL;
+   newtable.table = (hash_entry_t *) gotcha_malloc(new_size * sizeof(hash_entry_t));
+   if (!newtable.table)
+      return -1;
+   gotcha_memset(newtable.table, 0, new_size * sizeof(hash_entry_t));
+   for (i = 0; i < table->table_size; i++) {
+      hash_entry_t *old = table->table + i;
+      if (old->status != INUSE)
+         continue;
+      if (!insert(&newtable, old->key, old->data, old->hash_value)) {
+         gotcha_free(newtable.table);
+         return -1;
+      }
+   }
+   destroy_hashtable(table);
+   *table = newtable;
+   return 0;
+}
+
+/* Frees the slot array and resets every field of *table to 0/NULL; always returns 0. */
+int destroy_hashtable(hash_table_t *table)
+{
+   gotcha_free(table->table);
+   table->table = NULL;
+   table->head = NULL;
+   table->hashfunc = NULL;
+   table->keycmp = NULL;
+   table->table_size = table->entry_count = 0;
+   return 0;
+}
+
+/* Linear-probe search for key starting at hash(key) % table_size.
+ * On a hit stores the slot in *entry and returns 0. Returns -1 when an EMPTY slot is reached or the
+ * probe wraps around to its starting index; TOMBSTONE slots are stepped over. */
+static int lookup(hash_table_t *table, hash_key_t key, hash_entry_t **entry)
+{
+   const hash_hashvalue_t wanted = table->hashfunc(key);
+   const size_t first = wanted % table->table_size;
+   size_t pos = first;
+
+   do {
+      hash_entry_t *slot = &table->table[pos];
+      /* An EMPTY slot terminates the search. */
+      if (slot->status == EMPTY)
+         return -1;
+      /* Cheap checks first: keycmp only runs for a live slot whose cached hash matches. */
+      if (slot->status == INUSE && slot->hash_value == wanted &&
+          table->keycmp(slot->key, key) == 0) {
+         *entry = slot;
+         return 0;
+      }
+      /* Step to the next slot, wrapping at the end of the array. */
+      pos = (pos + 1 == table->table_size) ? 0 : pos + 1;
+   } while (pos != first);
+
+   /* Every slot was visited without a match. */
+   return -1;
+}
+
+/* Finds key in table: on success copies the stored value to *data and returns 0, otherwise returns -1 and leaves *data alone. */
+int lookup_hashtable(hash_table_t *table, hash_key_t key, hash_data_t *data)
+{
+   hash_entry_t *hit;
+
+   if (lookup(table, key, &hit) == -1)
+      return -1;
+
+   *data = hit->data;
+   return 0;
+}
+
+/* Stores key -> data in table.
+ * Before inserting, capacity is doubled until the current entry count is at most half of it.
+ * Existing entries with an equal key are not looked for. Returns 0 on success, -1 if growing or inserting fails. */
+int addto_hashtable(hash_table_t *table, hash_key_t key, hash_data_t data)
+{
+   size_t target = table->table_size;
+
+   /* Pick the smallest power-of-two multiple of the current size that keeps the table at most half full. */
+   while (target / 2 < table->entry_count)
+      target *= 2;
+
+   if (target != table->table_size) {
+      if (grow_hashtable(table, target) == -1)
+         return -1;
+   }
+
+   /* The hash is computed once here and cached in the entry by insert(). */
+   if (insert(table, key, data, table->hashfunc(key)) == NULL)
+      return -1;
+
+   return 0;
+}
+
+
+/* Removes key from table: its slot is unlinked from the iteration list and turned into a TOMBSTONE.
+ * Returns 0 on success, -1 when key is not present. */
+int removefrom_hashtable(hash_table_t *table, hash_key_t key)
+{
+   hash_entry_t *doomed;
+
+   if (lookup(table, key, &doomed) == -1)
+      return -1;
+   /* Detach from the doubly linked iteration list. */
+   if (doomed == table->head)
+      table->head = doomed->next;
+   if (doomed->prev)
+      doomed->prev->next = doomed->next;
+   if (doomed->next)
+      doomed->next->prev = doomed->prev;
+   //Do not set entry->next to NULL, which would break the iterate & delete
+   //idiom used under dlopen_wrapper.
+
+   doomed->key = NULL;
+   doomed->data = NULL;
+   doomed->hash_value = 0;
+   doomed->status = TOMBSTONE;
+   table->entry_count--;
+   return 0;
+}
+
+/* Calls cb(key, data, opaque) for each entry on the iteration list; stops at, and returns, the first non-zero result (else 0). */
+int foreach_hash_entry(hash_table_t *table, void *opaque, int (*cb)(hash_key_t key, hash_data_t data, void *opaque))
+{
+   struct hash_entry_t *cursor = table->head;
+   while (cursor != NULL) {
+      int rc = cb(cursor->key, cursor->data, opaque);
+      if (rc != 0) return rc;
+      cursor = cursor->next;   /* read after the callback, which may have removed this entry */
+   }
+   return 0;
+}
+
+/* String hash: starts at 5381 and folds each character in as hash * 33 + c; the result is truncated to hash_hashvalue_t. */
+hash_hashvalue_t strhash(const char *str)
+{
+   unsigned long acc = 5381;
+   const char *p;
+
+   for (p = str; *p != '\0'; p++)
+      acc = acc * 33 + *p;
+   return (hash_hashvalue_t) acc;
+}
diff --git a/ext/GOTCHA/src/hash.h b/ext/GOTCHA/src/hash.h
new file mode 100644
index 000000000..5ac11a7e9
--- /dev/null
+++ b/ext/GOTCHA/src/hash.h
@@ -0,0 +1,52 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#if !defined(HASH_H_)
+#define HASH_H_
+
+#include <stdlib.h>
+#include <stdint.h>
+
+typedef void* hash_key_t;        /* opaque key; only the table's hashfunc and keycmp interpret it */
+typedef void* hash_data_t;       /* opaque value stored alongside a key */
+typedef int hash_hashvalue_t;    /* hash value type (plain int) */
+typedef hash_hashvalue_t (*hash_func_t)(hash_data_t data);   /* hashes a key; the parameter is declared hash_data_t, which is the same void* type */
+typedef int (*hash_cmp_t)(hash_key_t a, hash_key_t b);       /* key comparator; lookups treat a return of 0 as "keys are equal" */
+
+struct hash_entry_t;             /* slot layout is private to hash.c */
+
+typedef struct 
+{
+   size_t table_size;            /* number of slots in table */
+   size_t entry_count;           /* number of slots currently in use */
+   hash_func_t hashfunc;
+   hash_cmp_t keycmp;
+   struct hash_entry_t *table;   /* slot array */
+   struct hash_entry_t *head;    /* first entry of the list walked by foreach_hash_entry */
+} hash_table_t;
+
+int create_hashtable(hash_table_t *table, size_t initial_size, hash_func_t func, 
+                     hash_cmp_t keycmp);
+int grow_hashtable(hash_table_t *table, size_t new_size);
+int destroy_hashtable(hash_table_t *table);
+
+int lookup_hashtable(hash_table_t *table, hash_key_t key, hash_data_t *data);
+int addto_hashtable(hash_table_t *table, hash_key_t key, hash_data_t data);
+int removefrom_hashtable(hash_table_t *table, hash_key_t key);
+int foreach_hash_entry(hash_table_t *table, void *opaque, int (*cb)(hash_key_t key, hash_data_t data, void *opaque));
+
+hash_hashvalue_t strhash(const char *str);
+
+#endif
diff --git a/ext/GOTCHA/src/libc_wrappers.c b/ext/GOTCHA/src/libc_wrappers.c
new file mode 100644
index 000000000..4195480bf
--- /dev/null
+++ b/ext/GOTCHA/src/libc_wrappers.c
@@ -0,0 +1,581 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#define _GNU_SOURCE
+#define BUILDING_LIBC_WRAPPERS
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <signal.h>
+#include <sys/syscall.h>
+
+#include "libc_wrappers.h"
+#include "gotcha_auxv.h"
+
+typedef struct malloc_header_t {
+   size_t size;   /* usable bytes that follow this header */
+} malloc_header_t;
+
+typedef struct malloc_link_t {
+   malloc_header_t header;
+   struct malloc_link_t *next;   /* free-list link; this field's address is what gotcha_malloc hands to the caller */
+} malloc_link_t;
+
+#define MIN_SIZE (sizeof(malloc_link_t) - sizeof(malloc_header_t)) /* smallest request size: room for the next pointer */
+#define MIN_BLOCK_SIZE (1024*32) /* smallest region gotcha_malloc requests from gotcha_mmap */
+
+static malloc_link_t *free_list = NULL;   /* singly linked list of free blocks, most recently freed first */
+
+/* Shrinks allocation to new_size bytes and pushes the remainder on free_list; no split unless the surplus exceeds a malloc_link_t. */
+static void split_allocation(malloc_link_t *allocation, size_t new_size)
+{
+   const size_t surplus = allocation->header.size - new_size;
+   malloc_link_t *tail;
+
+   if (surplus <= sizeof(malloc_link_t))
+      return;
+   tail = (malloc_link_t *) ((unsigned char *) &allocation->next + new_size);
+   tail->header.size = surplus - sizeof(malloc_header_t);
+   tail->next = free_list;
+   allocation->header.size = new_size;
+   free_list = tail;
+}
+
+void *gotcha_malloc(size_t size)
+{  /* Best-fit allocator over free_list, backed by gotcha_mmap; returns NULL only when the mapping fails. */
+   malloc_link_t *cur, *prev, *newalloc;
+   malloc_link_t *best_fit = NULL, *best_fit_prev;
+   // Initialization of best_fit_diff to SIZE_MAX added by Thomas Gruber to
+   // avoid warnings
+   ssize_t best_fit_diff = SIZE_MAX, diff, block_size;
+   void *result;
+   /* Round the request up: at least MIN_SIZE bytes and a multiple of 8. */
+   if (size < MIN_SIZE)
+      size = MIN_SIZE;
+   if (size % 8)
+      size += 8 - (size % 8);
+
+   //Find the tightest fit allocation in the free list
+   for (prev = NULL, cur = free_list; cur; cur = cur->next) {
+      diff = cur->header.size - size;   /* negative when the block is too small */
+      if (diff >= 0 && (!best_fit || diff < best_fit_diff)) {
+         best_fit = cur;
+         best_fit_prev = prev;   /* remembered so the block can be unlinked below */
+         best_fit_diff = diff;
+         if (!diff)
+            break;   /* exact fit: stop searching */
+      }
+      prev = cur;
+   }
+
+   //Removes the best fit from the free list, split if needed, and return
+   if (best_fit) {
+      if (best_fit_prev)
+         best_fit_prev->next = best_fit->next;
+      else
+         free_list = best_fit->next;
+      split_allocation(best_fit, size);
+      return (void *) &best_fit->next;   /* user memory begins right after the header */
+   }
+
+   //Create a new allocation area
+   if (size + sizeof(malloc_header_t) > MIN_BLOCK_SIZE) {
+      block_size = size + sizeof(malloc_header_t);
+      diff = block_size % gotcha_getpagesize();
+      if (diff)
+         block_size += gotcha_getpagesize() - diff;   /* round large requests up to whole pages */
+   }
+   else {
+      block_size = MIN_BLOCK_SIZE;
+   }
+
+   result = gotcha_mmap(NULL, block_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+   if (result == MAP_FAILED)
+       return NULL;
+   newalloc = (malloc_link_t *) result;
+   newalloc->header.size = block_size - sizeof(malloc_header_t);
+   split_allocation(newalloc, size);   /* the unused rest of the new region goes onto free_list */
+   return (void *) &newalloc->next;
+}
+
+/* Returns a block of at least size bytes holding buffer's contents. A block that is already big enough is returned
+ * as is (never shrunk). NULL buffer acts like gotcha_malloc(size). On failure returns NULL and buffer stays valid. */
+void *gotcha_realloc(void* buffer, size_t size)
+{
+   void *newbuffer;
+   malloc_link_t *alloc;
+
+   if (!buffer)
+      return gotcha_malloc(size);
+   alloc = (malloc_link_t *) (((malloc_header_t *) buffer) - 1);
+   if (size <= alloc->header.size)
+      return buffer;
+   newbuffer = gotcha_malloc(size);
+   if (!newbuffer)
+      return NULL;
+   gotcha_memcpy(newbuffer, buffer, alloc->header.size);
+   gotcha_free(buffer);
+   return newbuffer;
+}
+
+/* Pushes buffer's block on the front of free_list (no coalescing, nothing is unmapped); NULL is ignored like free(NULL). */
+void gotcha_free(void *buffer)
+{
+   malloc_link_t *alloc = buffer ? (malloc_link_t *) (((malloc_header_t *) buffer) - 1) : NULL;
+   if (!alloc) return;
+   alloc->next = free_list;
+   free_list = alloc;
+}
+
+/* Copies size bytes from src to dest one byte at a time in ascending address order; returns nothing. */
+void gotcha_memcpy(void *dest, void *src, size_t size)
+{
+   unsigned char *out = (unsigned char *) dest;
+   const unsigned char *in = (const unsigned char *) src;
+   while (size--) *out++ = *in++;
+}
+
+/* strncmp() replacement: compares at most max_length characters as unsigned char.
+ * Returns 0 if equal, <0 if in_one sorts first, >0 otherwise (a shorter in_one used to yield +1). */
+int gotcha_strncmp(const char *in_one, const char *in_two, int max_length)
+{
+  int i = 0;
+  for (; i < max_length; i++) {
+    unsigned char a = (unsigned char) in_one[i], b = (unsigned char) in_two[i];
+    if (a != b)
+      return (int) a - (int) b;
+    if (a == '\0') break;   /* both strings ended here */
+  }
+  return 0;
+}
+
+/* strcmp() replacement: compares as unsigned char. Returns 0 if equal, <0 if in_one sorts first,
+ * >0 otherwise (the original returned +1 when in_one was a proper prefix of in_two). */
+int gotcha_strcmp(const char *in_one, const char *in_two)
+{
+  const unsigned char *a = (const unsigned char *) in_one;
+  const unsigned char *b = (const unsigned char *) in_two;
+  while (*a != '\0' && *a == *b) {
+    a++;
+    b++;
+  }
+  return (int) *a - (int) *b;
+}
+
+/* strstr() replacement: pointer to the first occurrence of searchFor inside searchIn, or NULL if absent. An empty
+ * searchFor matches at the start, as in libc. (Was: returned searchFor + i, a pointer into the needle; NULL if empty.) */
+char *gotcha_strstr(const char *searchIn, const char *searchFor)
+{
+   size_t i, j;
+   if (!searchFor[0])
+      return (char *) searchIn;
+   for (i = 0; searchIn[i]; i++) {
+      if (searchIn[i] != searchFor[0]) continue;
+      for (j = 1; ; j++) {
+         if (!searchFor[j])
+            return (char *) (searchIn + i);
+         if (!searchIn[i+j])
+            return NULL;   /* haystack exhausted: no later start position can fit the needle */
+         if (searchFor[j] != searchIn[i+j])
+            break;
+      }
+   }
+   return NULL;
+}
+
+ssize_t gotcha_write(int fd, const void *buf, size_t count)
+{  /* write() issued through syscall(SYS_write, ...); the syscall() return value is passed back unchanged */
+   return syscall(SYS_write, fd, buf, count);
+}
+
+size_t gotcha_strlen(const char *s)   /* number of characters in s before its terminating NUL */
+{
+   const char *end = s;
+   while (*end) end++;
+   return (size_t) (end - s);
+}
+
+size_t gotcha_strnlen(const char *s, size_t max_length)
+{  /* Length of s capped at max_length; the bound is tested first so s[max_length] is never read. */
+   size_t i;
+   for (i = 0; i < max_length && s[i]; i++);
+   return i;
+}
+
+/* Writes num as hexadecimal digits (no "0x" prefix) into str and NUL-terminates it.
+ * strlen is the capacity of str; uppercase selects A-F instead of a-f.
+ * Returns the number of digits, or -1 if it does not fit (non-zero values need digits + 2 <= strlen). */
+static int ulong_to_hexstr(unsigned long num, char *str, int strlen, int uppercase)
+{
+   const char *digits = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
+   unsigned long rest;
+   int ndigits = 0, pos;
+
+   if (num == 0) {
+      if (strlen < 2)
+         return -1;
+      str[0] = '0';
+      str[1] = '\0';
+      return 1;
+   }
+
+   for (rest = num; rest != 0; rest >>= 4)
+      ndigits++;
+   if (ndigits + 1 >= strlen)
+      return -1;
+
+   str[ndigits] = '\0';
+   for (pos = ndigits - 1, rest = num; pos >= 0; pos--, rest >>= 4)
+      str[pos] = digits[rest & 0xf];
+   return ndigits;
+}
+
+/* Writes num in decimal into str (capacity strlen, NUL included) and NUL-terminates it.
+ * Returns the number of digits written, or -1 when the buffer is too small. */
+static int ulong_to_str(unsigned long num, char *str, int strlen)
+{
+   unsigned long rest;
+   int ndigits = 0, pos;
+
+   if (num == 0) {
+      if (strlen < 2)
+         return -1;
+      str[0] = '0';
+      str[1] = '\0';
+      return 1;
+   }
+
+   for (rest = num; rest != 0; rest /= 10)
+      ndigits++;
+   if (ndigits + 1 >= strlen)
+      return -1;
+
+   str[ndigits] = '\0';
+   for (pos = ndigits - 1, rest = num; pos >= 0; pos--, rest /= 10)
+      str[pos] = (char) ('0' + rest % 10);
+   return ndigits;
+}
+
+/* Decimal text of num (leading '-' when negative) into str of capacity strlen; returns characters written, or -1 if too small. */
+static int slong_to_str(signed long num, char *str, int strlen)
+{
+   int result;
+   if (num >= 0)
+      return ulong_to_str((unsigned long) num, str, strlen);
+   /* Negate in unsigned arithmetic: "num * -1" is signed overflow (undefined behaviour) for LONG_MIN. */
+   result = ulong_to_str(0UL - (unsigned long) num, str+1, strlen-1);
+   if (result == -1) return -1;
+   str[0] = '-';
+   return result + 1;
+}
+
+void gotcha_assert_fail(const char *s, const char *file, unsigned int line, const char *function)
+{  /* Writes "file:line: function: Assertion `s' failed." to fd 2, then sends SIGABRT to this process. */
+   char linestr[64];
+   int result;
+
+   result = ulong_to_str(line, linestr, sizeof(linestr)-1);
+   if (result == -1)
+      linestr[0] = '\0';   /* number did not fit: print an empty line field */
+   /* The message is emitted piecewise through gotcha_write; the byte counts match the literal lengths. */
+   gotcha_write(2, file, gotcha_strlen(file));
+   gotcha_write(2, ":", 1);
+   gotcha_write(2, linestr, gotcha_strlen(linestr));
+   gotcha_write(2, ": ", 2);
+   gotcha_write(2, function, gotcha_strlen(function));
+   gotcha_write(2, ": Assertion `", 13);
+   gotcha_write(2, s, gotcha_strlen(s));
+   gotcha_write(2, "' failed.\n", 10);
+   syscall(SYS_kill, gotcha_getpid(), SIGABRT);   /* NOTE(review): execution continues past this call if SIGABRT is blocked or handled */
+}
+
+extern char **__environ;
+/* getenv() replacement that scans __environ directly.
+ * Returns a pointer to the text after "name=" inside the matching environment entry,
+ * or NULL when no entry has exactly that name. */
+char *gotcha_getenv(const char *name)
+{
+   char **entry;
+   int name_len = gotcha_strlen(name);
+
+   for (entry = __environ; *entry != NULL; entry++) {
+      /* The entry must start with name and be followed immediately by '='. */
+      if (gotcha_strncmp(name, *entry, name_len) == 0 && (*entry)[name_len] == '=')
+         return *entry + name_len + 1;
+   }
+   return NULL;
+}
+
+pid_t gotcha_getpid()
+{  /* process id obtained through syscall(SYS_getpid) */
+   return syscall(SYS_getpid);
+}
+
+pid_t gotcha_gettid()
+{  /* thread id obtained through syscall(SYS_gettid) */
+   return syscall(SYS_gettid);
+}
+
+/* Page size as reported by get_auxv_pagesize(), cached after the first call; 4096 is used when that reports 0. */
+unsigned int gotcha_getpagesize()
+{
+   static unsigned int cached = 0;
+   if (cached == 0) {
+      cached = get_auxv_pagesize();
+      if (cached == 0)
+         cached = 4096;
+   }
+   return cached;
+}
+
+/* open() replacement issued through syscall(SYS_open, ...).
+ * The optional third argument (a mode_t) is read only when flags contains O_CREAT.
+ * Returns the new file descriptor, or -1 whenever the syscall reports a negative result. */
+int gotcha_open(const char *pathname, int flags, ...)
+{
+   mode_t mode = 0;
+   va_list ap;
+   long fd;
+
+   /* Fetch the variadic mode only if the caller was required to supply one. */
+   va_start(ap, flags);
+   if (flags & O_CREAT)
+      mode = va_arg(ap, mode_t);
+   va_end(ap);
+
+   /* mode is passed unconditionally; it is 0 when O_CREAT was not requested. */
+   fd = syscall(SYS_open, pathname, flags, mode);
+   if (fd < 0)
+      return -1;
+   return (int) fd;
+}
+
+void *gotcha_mmap(void *addr, size_t length, int prot, int flags,
+                  int fd, off_t offset)
+{  /* mmap() issued through syscall(SYS_mmap, ...) with the arguments forwarded unchanged */
+   long result;
+
+   result = syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
+   return (void *) result;   /* the long returned by syscall() is cast to a pointer; gotcha_malloc compares it with MAP_FAILED */
+}
+
+/* atoi() replacement: any number of leading '-' signs (each one flips the sign) followed by decimal
+ * digits; parsing stops at the first non-digit. Leading whitespace and '+' are not accepted.
+ * Returns 0 when no digit is found. */
+int gotcha_atoi(const char *nptr)
+{
+   int neg = 1, val = 0;
+
+   while (*nptr == '-') {
+      neg = -neg;
+      nptr++;
+   }
+
+   /* Accumulate left to right. The original walked backwards down to the pointer nptr-1 and kept a
+    * power-of-ten multiplier that overflowed after the tenth digit even for values that fit in int. */
+   for (; *nptr >= '0' && *nptr <= '9'; nptr++)
+      val = val * 10 + (*nptr - '0');
+
+   return val * neg;
+}
+
+int gotcha_close(int fd)
+{  /* close() issued through syscall(SYS_close, fd); returns the syscall() result */
+   return syscall(SYS_close, fd);
+}
+
+int gotcha_mprotect(void *addr, size_t len, int prot)
+{  /* mprotect() issued through syscall(SYS_mprotect, ...); returns the syscall() result */
+   return syscall(SYS_mprotect, addr, len, prot);
+}
+
+ssize_t gotcha_read(int fd, void *buf, size_t count)
+{  /* read() issued through syscall(SYS_read, ...); returns the syscall() result */
+   return syscall(SYS_read, fd, buf, count);
+}
+
+/* Appends str to buffer, flushing to fd whenever it is full, until the end of str or (unless print_percent) the next '%'.
+ * Returns where copying stopped. The original skipped the store on the iteration that flushed, so that character was lost. */
+static const char *add_to_buffer(const char *str, int fd, int *pos, char *buffer,
+                                 int buffer_size, int *num_printed, int print_percent)
+{
+   for (; *str && (print_percent || *str != '%'); str++) {
+      if (*pos >= buffer_size) {
+         gotcha_write(fd, buffer, buffer_size);
+         *num_printed += buffer_size;
+         *pos = 0;
+      }
+      buffer[*pos] = *str;
+      *pos = *pos + 1;
+   }
+   return str;
+}
+
+int gotcha_int_printf(int fd, const char *format, ...)
+{  /* Minimal printf to a descriptor: %d %i %u %x %X %p %c %s %% with optional hh/h/l/ll/z; no flags, width or precision. */
+#define inc(S) do { S++; if (*(S) == '\0') goto done; } while(0) /* advance S; leave the parse loop at end of format */
+   va_list args;
+   const char *str = format;
+   int buffer_pos = 0;
+   int char_width, short_width, long_width, long_long_width, size_width;
+   int num_printed = 0;
+   char buffer[4096];
+   /* Output is staged in buffer; add_to_buffer flushes it to fd when full and the rest is written at done:. */
+   va_start(args, format);
+   while (*str) {
+      str = add_to_buffer(str, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 0);
+      if (!*str) break;
+      /* add_to_buffer copied the literal text and stopped on a '%'. */
+      gotcha_assert(*str == '%');
+      inc(str);
+      /* Parse an optional length modifier. */
+      char_width = short_width = long_width = long_long_width = size_width = 0;
+      if (*str == 'h' && *(str+1) == 'h') {
+         char_width = 1;
+         inc(str);
+         inc(str);
+      }
+      else if (*str == 'h') {
+         short_width = 1;
+         inc(str);
+      }
+      else if (*str == 'l' && *(str+1) == 'l') {
+         long_long_width = 1;
+         inc(str);
+         inc(str);
+      }
+      else if (*str == 'l') {
+         long_width = 1;
+         inc(str);
+      }
+      else if (*str == 'z') {
+         size_width = 1;
+         inc(str);
+      }
+      /* Dispatch on the conversion character; every value is widened to long / unsigned long before formatting. */
+      if (*str == 'd' || *str == 'i') {
+         signed long val;
+         char numstr[64];
+         if (char_width)
+            val = (signed long) (signed char) va_arg(args, signed int);
+         else if (short_width)
+            val = (signed long) (signed short) va_arg(args, signed int);
+         else if (long_width)
+            val = (signed long) va_arg(args, signed long);
+         else if (long_long_width)
+            val = (signed long) va_arg(args, signed long long);
+         else if (size_width)
+            val = (signed long) va_arg(args, ssize_t);
+         else
+            val = (signed long) va_arg(args, signed int);
+         slong_to_str(val, numstr, 64);
+         add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else if (*str == 'u') {
+         unsigned long val;
+         char numstr[64];
+         if (char_width)
+            val = (unsigned long) (unsigned char) va_arg(args, unsigned int);
+         else if (short_width)
+            val = (unsigned long) (unsigned short) va_arg(args, unsigned int);
+         else if (long_width)
+            val = (unsigned long) va_arg(args, unsigned long);
+         else if (long_long_width)
+            val = (unsigned long) va_arg(args, unsigned long long);
+         else if (size_width)
+            val = (unsigned long) va_arg(args, ssize_t);
+         else
+            val = (unsigned long) va_arg(args, unsigned int);
+         ulong_to_str(val, numstr, 64);
+         add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else if (*str == 'x' || *str == 'X' || *str == 'p') {
+         unsigned long val;
+         char numstr[64];
+         if (*str != 'p') {
+            if (char_width)
+               val = (unsigned long) (unsigned char) va_arg(args, unsigned int);
+            else if (short_width)
+               val = (unsigned long) (unsigned short) va_arg(args, unsigned int);
+            else if (long_width)
+               val = (unsigned long) va_arg(args, unsigned long);
+            else if (long_long_width)
+               val = (unsigned long) va_arg(args, unsigned long long);
+            else if (size_width)
+               val = (unsigned long) va_arg(args, ssize_t);
+            else
+               val = (unsigned long) va_arg(args, unsigned int);
+         }
+         else {
+            val = (unsigned long) va_arg(args, void *);
+            add_to_buffer("0x", fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);   /* %p prints a 0x prefix, %x and %X do not */
+         }
+         ulong_to_hexstr(val, numstr, 64, (*str == 'X'));
+         add_to_buffer(numstr, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else if (*str == 'c') {
+         char cbuf[2];
+         cbuf[0] = (unsigned char) va_arg(args, unsigned int);
+         cbuf[1] = '\0';
+         add_to_buffer(cbuf, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else if (*str == 's') {
+         char *s = (char *) va_arg(args, char *);   /* NOTE(review): a NULL argument is dereferenced by add_to_buffer */
+         add_to_buffer(s, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else if (*str == '%') {
+         add_to_buffer("%", fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      else {
+         char s[3];   /* unknown conversion: echo it as '%' followed by the character */
+         s[0] = '%';
+         s[1] = *str;
+         s[2] = '\0';
+         add_to_buffer(s, fd, &buffer_pos, buffer, sizeof(buffer), &num_printed, 1);
+      }
+      inc(str);
+   }
+   /* Flush what is still staged; the return value counts the bytes passed to gotcha_write, not the bytes it accepted. */
+  done:
+   gotcha_write(fd, buffer, buffer_pos);
+   num_printed += buffer_pos;
+   va_end(args);
+   return num_printed;
+}
+
+/* memset() replacement: stores (unsigned char) c into each of the first n bytes of s and returns s. */
+void *gotcha_memset(void *s, int c, size_t n)
+{
+   unsigned char *p = (unsigned char *) s;
+   unsigned char *const end = p + n;
+   const unsigned char fill = (unsigned char) c;
+   while (p != end) *p++ = fill;
+   return s;
+}
+
+/* strncat() replacement: appends at most n characters of src to dest and NUL-terminates; copies strnlen(src, n) bytes (was n). */
+char* gotcha_strncat(char* dest, const char* src, size_t n){
+  char* tail = dest + gotcha_strlen(dest);
+  size_t copy_len = gotcha_strnlen(src, n);
+  gotcha_memcpy(tail, (void *) src, copy_len);
+  tail[copy_len] = '\0';
+  return dest;
+}
diff --git a/ext/GOTCHA/src/libc_wrappers.h b/ext/GOTCHA/src/libc_wrappers.h
new file mode 100644
index 000000000..d3b19b163
--- /dev/null
+++ b/ext/GOTCHA/src/libc_wrappers.h
@@ -0,0 +1,98 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#if !defined(LIBC_WRAPPERS_H_)
+#define LIBC_WRAPPERS_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#ifndef FORCE_NO_LIBC
+#define GOTCHA_USE_LIBC
+#endif
+
+#if defined(GOTCHA_USE_LIBC) && !defined(BUILDING_LIBC_WRAPPERS)
+
+#define gotcha_malloc             malloc   /* libc mode: each gotcha_* name below is a plain alias for the libc function */
+#define gotcha_realloc            realloc
+#define gotcha_free               free
+#define gotcha_memcpy             memcpy
+#define gotcha_strncmp            strncmp
+#define gotcha_strstr             strstr
+#define gotcha_assert             assert
+#define gotcha_strcmp             strcmp
+#define gotcha_getenv             getenv
+#define gotcha_getpid             getpid
+#define gotcha_getpagesize        getpagesize
+#define gotcha_open               open
+#define gotcha_mmap               mmap
+#define gotcha_atoi               atoi
+#define gotcha_close              close
+#define gotcha_mprotect           mprotect
+#define gotcha_read               read
+#define gotcha_memset             memset
+#define gotcha_write              write
+#define gotcha_strlen             strlen
+#define gotcha_strnlen            strnlen
+#define gotcha_strtok             strtok
+#define gotcha_strncat            strncat
+#define gotcha_dbg_printf(A, ...) fprintf(stderr, A, ##__VA_ARGS__) /* NOTE(review): uses fprintf/stderr but this header does not include <stdio.h> - confirm every includer provides it */
+pid_t gotcha_gettid();            //No libc gettid, always use gotcha version
+
+#else
+
+void *gotcha_malloc(size_t size);   /* prototypes used when GOTCHA_USE_LIBC is off or BUILDING_LIBC_WRAPPERS is defined */
+void *gotcha_realloc(void* buffer, size_t size);
+void gotcha_free(void* free_me);
+void gotcha_memcpy(void* dest, void* src, size_t size);   /* unlike libc memcpy: returns nothing and src is not const */
+int gotcha_strncmp(const char* in_one, const char* in_two, int max_length);   /* length is int here, size_t in libc strncmp */
+char *gotcha_strstr(const char* searchIn,const char* searchFor);
+int gotcha_strcmp(const char* in_one, const char* in_two);
+char *gotcha_getenv(const char *env);
+pid_t gotcha_getpid();
+pid_t gotcha_gettid();
+unsigned int gotcha_getpagesize();
+int gotcha_open(const char *pathname, int flags, ...);
+void *gotcha_mmap(void *addr, size_t length, int prot, int flags,
+                  int fd, off_t offset);
+int gotcha_atoi(const char *nptr);
+int gotcha_close(int fd);
+int gotcha_mprotect(void *addr, size_t len, int prot);
+ssize_t gotcha_read(int fd, void *buf, size_t count);
+ssize_t gotcha_write(int fd, const void *buf, size_t count);
+void gotcha_assert_fail(const char *s, const char *file, unsigned int line, const char *function);   /* called by the gotcha_assert macro below */
+void *gotcha_memset(void *s, int c, size_t n);
+size_t gotcha_strlen(const char* str);
+size_t gotcha_strnlen(const char* str, size_t max_length);
+char* gotcha_strncat(char* dest, const char* src, size_t n);
+char* gotcha_strtok(char* dest, const char* src, size_t n);   /* NOTE(review): 3 arguments, but the libc-mode alias is 2-argument strtok - confirm against the definition */
+
+#define gotcha_dbg_printf(FORMAT, ...) gotcha_int_printf(2, FORMAT, ##__VA_ARGS__)
+
+#define gotcha_assert(A)   /* calls gotcha_assert_fail when A is false */ \
+   do {                                                           \
+      if (! (A) )                                                 \
+         gotcha_assert_fail("" #A, __FILE__, __LINE__, __func__); \
+   } while (0)   /* no trailing ';': the caller's own ';' ends the statement, keeping if/else chains valid */
+
+#endif
+
+int gotcha_int_printf(int fd, const char *format, ...);
+
+#endif
diff --git a/ext/GOTCHA/src/library_filters.c b/ext/GOTCHA/src/library_filters.c
new file mode 100644
index 000000000..028b53808
--- /dev/null
+++ b/ext/GOTCHA/src/library_filters.c
@@ -0,0 +1,47 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include "library_filters.h"
+#include "libc_wrappers.h"
+
+static const char* filter;
+int (*libraryFilterFunc)(struct link_map*) = alwaysTrue;
+
+/* Initial library filter (also installed by restoreLibraryFilterFunc): accepts every candidate. */
+int alwaysTrue(struct link_map* candidate KNOWN_UNUSED)
+{ return 1; }
+
+/* Filter: 1 when a name filter is set, target is non-NULL and target->l_name contains the filter string; else 0. */
+int trueIfNameMatches(struct link_map* target){
+  return (filter && target && gotcha_strstr(target->l_name, filter)) ? 1 : 0;
+}
+/* Filter: 1 when target has no successor in the link_map chain (l_next is NULL), otherwise 0. */
+int trueIfLast(struct link_map* target){
+  return !target->l_next;
+}
+/* Make trueIfLast the active library filter. */
+void onlyFilterLast()
+{ setLibraryFilterFunc(trueIfLast); }
+/* Replace the active library filter (the global libraryFilterFunc pointer) with new_func. */
+void setLibraryFilterFunc(int(*new_func)(struct link_map*))
+{ libraryFilterFunc = new_func; }
+/* Reset the active library filter to alwaysTrue, the value libraryFilterFunc starts with. */
+void restoreLibraryFilterFunc()
+{ setLibraryFilterFunc(alwaysTrue); }
+
+/* Remember nameFilter (the pointer is stored, the string is not copied) and activate trueIfNameMatches. */
+void filterLibrariesByName(const char* nameFilter){
+  setLibraryFilterFunc(trueIfNameMatches);
+  filter = nameFilter;
+}
+
diff --git a/ext/GOTCHA/src/library_filters.h b/ext/GOTCHA/src/library_filters.h
new file mode 100644
index 000000000..34dea03bd
--- /dev/null
+++ b/ext/GOTCHA/src/library_filters.h
@@ -0,0 +1,32 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+// TODO: Determine whether this interface should stay on in this form
+
+#ifndef GOTCHA_LIBRARY_FILTERS_H
+#define GOTCHA_LIBRARY_FILTERS_H
+#include <link.h>
+#include "gotcha_utils.h"
+
+int alwaysTrue(struct link_map* candidate KNOWN_UNUSED);
+extern int (*libraryFilterFunc)(struct link_map*);
+
+int trueIfNameMatches(struct link_map* target);
+int trueIfLast(struct link_map* target);
+void filterLibrariesByName(const char* nameFilter);
+void onlyFilterLast();
+void setLibraryFilterFunc(int(*new_func)(struct link_map*));
+void restoreLibraryFilterFunc();
+#endif
diff --git a/ext/GOTCHA/src/tool.c b/ext/GOTCHA/src/tool.c
new file mode 100644
index 000000000..1f43b5eb7
--- /dev/null
+++ b/ext/GOTCHA/src/tool.c
@@ -0,0 +1,198 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "tool.h"
+#include "libc_wrappers.h"
+#include "gotcha_utils.h"
+
+static tool_t *tools = NULL;
+static binding_t *all_bindings = NULL;
+
+/* Returns the head of the global tool list (NULL while no tool has been inserted). */
+tool_t* get_tool_list()
+{ return tools; }
+
+/* Compares two tools by name and returns gotcha_strcmp's result, so 0 means the names are EQUAL. */
+int tool_equal(tool_t* t1, tool_t* t2)
+{ return gotcha_strcmp(t1->tool_name,t2->tool_name); }
+
+/* Unlink from the global list the first tool whose name equals target's (tool_equal() == 0); nothing is freed. */
+void remove_tool_from_list(struct tool_t* target){
+     if(!tools) return;                 /* empty list: nothing to remove */
+     if(!tool_equal(tools,target)){     /* match at the head: advance the head */
+        tools = tools->next_tool;
+        return;
+     }
+     struct tool_t *cur = tools;
+     while( (cur->next_tool != NULL) && (tool_equal(cur->next_tool,target))){
+        cur = cur->next_tool;           /* successor exists but carries a different name */
+     }
+     /* The loop also stops at the tail without a match; only unlink when a successor was found. */
+     if(cur->next_tool != NULL){
+        cur->next_tool = cur->next_tool->next_tool;
+     }
+}
+
+/* Link new_tool into the global tool list according to its config.priority.
+ * The list is walked while entries have a strictly smaller priority, so the
+ * tool ends up in front of the first entry whose priority is >= its own, or
+ * at the tail when there is no such entry.  new_tool is not unlinked first;
+ * this function only inserts. */
+void reorder_tool(tool_t* new_tool) {
+  int new_priority = new_tool->config.priority;
+  /* slot is the link (the list head or some next_tool field) that will point at new_tool */
+  tool_t **slot = &tools;
+  while(*slot != NULL && (*slot)->config.priority < new_priority){
+     slot = &(*slot)->next_tool;
+  }
+  new_tool->next_tool = *slot;
+  *slot = new_tool;
+}
+
+/* Allocate a tool named tool_name with the default configuration, no bindings and no
+ * parent, and insert it into the global list via reorder_tool().  The name pointer is
+ * stored as given (not copied).  Returns the new tool, or NULL when allocation fails. */
+tool_t *create_tool(const char *tool_name)
+{
+   debug_printf(1, "Found no existing tool with name %s\n",tool_name);
+   // TODO: ensure free
+   tool_t *created = (tool_t *) gotcha_malloc(sizeof(tool_t));
+   if (created == NULL) {
+      error_printf("Failed to malloc tool %s\n", tool_name);
+      return NULL;
+   }
+   created->tool_name = tool_name;
+   created->binding = NULL;
+   created->config = get_default_configuration();
+   reorder_tool(created);   /* sets created->next_tool and, when inserted first, the list head */
+   created->parent_tool = NULL;
+   create_hashtable(&created->child_tools, 24, (hash_func_t) strhash, (hash_cmp_t) gotcha_strcmp);
+   debug_printf(1, "Created new tool %s\n", tool_name);
+   return created;
+}
+
+/* Look up a tool by name with a linear scan of the global tool list.
+ * Returns the first tool whose tool_name compares equal to tool_name, or NULL. */
+tool_t *get_tool(const char *tool_name)
+{
+   tool_t *cur = tools;
+   while (cur != NULL && gotcha_strcmp(tool_name, cur->tool_name) != 0) {
+      cur = cur->next_tool;
+   }
+   return cur;
+}
+
+/* Create the internal binding table for user_binding[0..user_binding_size) and link it to tool and to the
+ * global binding list.  Returns the new table, or NULL on failure (function handles are then left untouched). */
+binding_t *add_binding_to_tool(tool_t *tool, struct gotcha_binding_t *user_binding, int user_binding_size)
+{
+   int result, i;
+   binding_t *newbinding = (binding_t *) gotcha_malloc(sizeof(binding_t));
+   struct internal_binding_t* internal_bindings = (struct internal_binding_t*)gotcha_malloc(sizeof(struct internal_binding_t)*user_binding_size);
+   /* A NULL array is only an error when entries were requested (a zero-size request may yield NULL). */
+   if (!newbinding || (!internal_bindings && user_binding_size > 0)) {
+      error_printf("Could not allocate binding table for %s\n", tool->tool_name);
+      goto error;
+   }
+
+   newbinding->tool = tool;
+   for(i=0;i<user_binding_size;i++){
+      internal_bindings[i].user_binding = &user_binding[i];
+      internal_bindings[i].associated_binding_table = newbinding;
+   }
+   newbinding->internal_bindings = internal_bindings;
+   newbinding->internal_bindings_size = user_binding_size;
+   result = create_hashtable(&newbinding->binding_hash, user_binding_size * 2, (hash_func_t) strhash, (hash_cmp_t) gotcha_strcmp);
+   if (result != 0) {
+      error_printf("Could not create hash table for %s\n", tool->tool_name);
+      goto error;
+   }
+   for (i = 0; i < user_binding_size; i++) {
+      result = addto_hashtable(&newbinding->binding_hash, (void *) user_binding[i].name, (void *) (internal_bindings + i));
+      if (result != 0) {
+         error_printf("Could not add hash entry for %s to table for tool %s\n", user_binding[i].name, tool->tool_name);
+         goto error;
+      }
+   }
+   for (i = 0; i < user_binding_size; i++)   /* publish handles only now that nothing can fail any more */
+      *(user_binding[i].function_handle) = &internal_bindings[i];
+   newbinding->next_tool_binding = tool->binding;
+   tool->binding = newbinding;
+   newbinding->next_binding = all_bindings;
+   all_bindings = newbinding;
+   debug_printf(2, "Created new binding table of size %d for tool %s\n", user_binding_size, tool->tool_name);
+   return newbinding;
+
+  error:   /* NOTE(review): storage owned by binding_hash, if it was created, is not released here */
+   if (internal_bindings) gotcha_free(internal_bindings);
+   if (newbinding) gotcha_free(newbinding);
+   return NULL;
+}
+
+/* Returns the head of the global list of binding tables (chained through next_binding). */
+binding_t *get_bindings() {
+   return all_bindings;
+}
+
+/* Returns the head of tool's own binding tables (chained through next_tool_binding). */
+binding_t *get_tool_bindings(tool_t *tool) {
+   return tool->binding;
+}
+
+/* Returns the default tool configuration: priority is UNSET_PRIORITY. */
+struct gotcha_configuration_t get_default_configuration(){
+  struct gotcha_configuration_t defaults = { .priority = UNSET_PRIORITY };
+  return defaults;
+}
+
+/* Store the default for key through data (written as an int for GOTCHA_PRIORITY; untouched for other keys). Always GOTCHA_SUCCESS. */
+enum gotcha_error_t get_default_configuration_value(enum gotcha_config_key_t key, void* data){
+  int default_priority = get_default_configuration().priority;
+  if(key==GOTCHA_PRIORITY){
+    *((int*)(data)) = default_priority;
+  }
+  return GOTCHA_SUCCESS;
+}
+
+/* Resolve configuration key for the tool named tool_name and store the result through
+ * location_to_store_result (written as an int for GOTCHA_PRIORITY).  The default value is
+ * stored first; then the tool and its parent_tool chain are searched for the first
+ * priority that is not UNSET_PRIORITY.
+ * Returns GOTCHA_INVALID_TOOL for an unknown tool name, GOTCHA_INTERNAL for any key other
+ * than GOTCHA_PRIORITY, and GOTCHA_SUCCESS otherwise (also when only the default was stored). */
+enum gotcha_error_t get_configuration_value(const char* tool_name, enum gotcha_config_key_t key, void* location_to_store_result){
+  struct tool_t* tool = get_tool(tool_name);
+  if(tool==NULL){
+     error_printf("Property being examined for nonexistent tool %s\n", tool_name);
+     return GOTCHA_INVALID_TOOL;
+  }
+  get_default_configuration_value(key, location_to_store_result);
+  /* walk from the tool itself up through its parents */
+  for( ; tool!=NULL; tool = tool->parent_tool){
+    if(key!=GOTCHA_PRIORITY){
+      error_printf("Invalid property being configured on tool %s\n", tool_name);
+      return GOTCHA_INTERNAL;
+    }
+    if(tool->config.priority!=UNSET_PRIORITY){
+      *((int*)(location_to_store_result)) = tool->config.priority;
+      return GOTCHA_SUCCESS;
+    }
+  }
+  return GOTCHA_SUCCESS;
+}
+
+/* Returns the priority stored in tool's own configuration (parents are not consulted). */
+int get_priority(tool_t *tool) {
+   return tool->config.priority;
+}
diff --git a/ext/GOTCHA/src/tool.h b/ext/GOTCHA/src/tool.h
new file mode 100644
index 000000000..3fc1daeed
--- /dev/null
+++ b/ext/GOTCHA/src/tool.h
@@ -0,0 +1,106 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#if !defined(TOOL_H_)
+#define TOOL_H_
+
+#include "gotcha/gotcha.h"
+#include "gotcha/gotcha_types.h"
+#include "hash.h"
+
+struct tool_t;
+
+#define UNSET_PRIORITY (-1)
+
+enum gotcha_config_key_t {
+  GOTCHA_PRIORITY
+};
+
+/**
+ * A structure representing how a given tool's bindings are configured
+ */
+struct gotcha_configuration_t {
+  int priority;
+};
+
+/**
+ * A per-library structure
+ **/
+#define LIB_GOT_MARKED_WRITEABLE (1 << 0)
+#define LIB_PRESENT              (1 << 1)
+typedef struct library_t {
+   struct link_map *map;       /* link_map this record describes (the key used by get_library/add_library/remove_library) */
+   struct library_t *next;     /* next/prev: doubly linked list of library records */
+   struct library_t *prev;
+   unsigned int generation;    /* NOTE(review): presumably compared against current_generation - confirm in the implementation */
+   int flags;                  /* NOTE(review): presumably a bitmask of the LIB_* values above - confirm */
+} library_t;
+struct library_t *get_library(struct link_map *map);
+struct library_t *add_library(struct link_map *map);
+void remove_library(struct link_map *map);
+extern unsigned int current_generation;
+   
+/**
+ * The internal structure that matches the external gotcha_binding_t.
+ * In addition to the data specified in the gotcha_binding_t, we add:
+ * - a linked-list pointer to the next binding table for this tool
+ * - a linked-list pointer to the next binding table
+ **/
+typedef struct binding_t {
+   struct tool_t *tool;                           /* tool this table was registered for */
+   struct internal_binding_t *internal_bindings;  /* array with one entry per user binding */
+   int internal_bindings_size;                    /* number of entries in internal_bindings */
+   hash_table_t binding_hash;                     /* maps a user binding's name to its internal_binding_t */
+   struct binding_t *next_tool_binding;           /* next table of the same tool (chain starting at tool->binding) */
+   struct binding_t *next_binding;                /* next table in the global list returned by get_bindings() */
+} binding_t;
+
+/**
+ * A structure for representing tools. Once we support stacking multiple
+ * tools this will become more important.
+ **/
+typedef struct tool_t {
+   const char *tool_name;                  /* name pointer as passed to create_tool (stored, not copied) */
+   binding_t *binding;                     /* head of this tool's binding tables; NULL after create_tool */
+   struct tool_t *next_tool;               /* next tool in the global list maintained by reorder_tool */
+   struct gotcha_configuration_t config;   /* starts as get_default_configuration() */
+   hash_table_t child_tools;               /* created in create_tool with strhash / gotcha_strcmp */
+   struct tool_t * parent_tool;            /* NULL after create_tool; get_configuration_value walks this chain */
+} tool_t;
+
+struct internal_binding_t {
+  struct binding_t* associated_binding_table;  /* table whose internal_bindings array holds this entry */
+  struct gotcha_binding_t* user_binding;       /* the caller-supplied binding this entry mirrors */
+  struct internal_binding_t* next_binding;     /* NOTE(review): not set by add_binding_to_tool - confirm who maintains it */
+  void* wrappee_pointer;                       /* NOTE(review): not set by add_binding_to_tool - presumably the wrapped function's address */
+};
+
+tool_t *create_tool(const char *tool_name);   /* allocates a tool and inserts it into the global list; NULL on allocation failure */
+tool_t *get_tool(const char *tool_name);      /* lookup by name; NULL when no tool matches */
+tool_t *get_tool_list();
+void reorder_tool(tool_t* new_tool);          /* inserts by config.priority; does not unlink first */
+void remove_tool_from_list(struct tool_t* target);
+void print_tools();                           /* NOTE(review): no definition in tool.c - confirm it exists elsewhere */
+
+binding_t *add_binding_to_tool(tool_t *tool, struct gotcha_binding_t *user_binding, int user_binding_size);
+binding_t *get_bindings();
+binding_t *get_tool_bindings(tool_t *tool);
+
+struct gotcha_configuration_t get_default_configuration();
+enum gotcha_error_t get_configuration_value(const char* tool_name, enum gotcha_config_key_t key, void* location_to_store_result);
+int get_priority(tool_t *tool);
+int tool_equal(tool_t* tool_1, tool_t* tool_2);   /* strcmp-style result: 0 when the two tool names are equal */
+
+#endif
diff --git a/ext/GOTCHA/src/translations.c b/ext/GOTCHA/src/translations.c
new file mode 100644
index 000000000..ff88e3fc6
--- /dev/null
+++ b/ext/GOTCHA/src/translations.c
@@ -0,0 +1,38 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <gotcha/gotcha.h>
+#include "translations.h"
+#include "gotcha_utils.h"
+
+int main_wrapped;
+gotcha_wrappee_handle_t gotcha_internal_libc_main_wrappee_handle;
+gotcha_wrappee_handle_t gotcha_internal_main_wrappee_handle;
+
+/* Wrapper bound to "main": forwards argc/argv/envp to the wrappee behind gotcha_internal_main_wrappee_handle. */
+int gotcha_internal_main(int argc, char** argv, char** envp){
+  return ((main_t) gotcha_get_wrappee(gotcha_internal_main_wrappee_handle))(argc, argv, envp);
+}
+/* Wrapper bound to "__libc_start_main": calls the wrapped start routine, passing the wrappee of "main" in place of main_arg. */
+int gotcha_internal_libc_start_main(int (*main_arg)(int, char**, char**) KNOWN_UNUSED, int argc, char** argv, void (*init)(), void (*fini)(), void (*rtld_fini)(), void* stack_end){
+   libc_start_main_t real_start = gotcha_get_wrappee(gotcha_internal_libc_main_wrappee_handle);
+   return real_start((main_t) gotcha_get_wrappee(gotcha_internal_main_wrappee_handle), argc, argv, init, fini, rtld_fini, stack_end);
+}
+
+struct gotcha_binding_t libc_main_wrappers[] = {   /* one-entry table: symbol name, wrapper function, address of the wrappee handle */
+  {"__libc_start_main", gotcha_internal_libc_start_main, &gotcha_internal_libc_main_wrappee_handle}
+};
+struct gotcha_binding_t main_wrappers[] = {        /* one-entry table in the same layout, for "main" */
+  {"main", gotcha_internal_main, &gotcha_internal_main_wrappee_handle}
+};
diff --git a/ext/GOTCHA/src/translations.h b/ext/GOTCHA/src/translations.h
new file mode 100644
index 000000000..faa0b25cb
--- /dev/null
+++ b/ext/GOTCHA/src/translations.h
@@ -0,0 +1,37 @@
+/*
+This file is part of GOTCHA.  For copyright information see the COPYRIGHT
+file in the top level directory, or at
+https://github.com/LLNL/gotcha/blob/master/COPYRIGHT
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License (as published by the Free
+Software Foundation) version 2.1 dated February 1999.  This program is
+distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+without even the IMPLIED WARRANTY OF MERCHANTABILITY or FITNESS FOR A PARTICULAR
+PURPOSE. See the terms and conditions of the GNU Lesser General Public License
+for more details.  You should have received a copy of the GNU Lesser General
+Public License along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/** 
+ * This file contains utilities for cases where users say something (wrap main) 
+ * that doesn't work, but can be translated to something that does work
+ */
+#ifndef GOTCHA_SRC_TRANSLATIONS_H
+#define GOTCHA_SRC_TRANSLATIONS_H
+#include <gotcha/gotcha.h>
+
+/** "int main" wrapping handling */
+typedef int (*libc_start_main_t) (int (*)(int, char**, char**), int, char**, void (*)(), void (*)(), void (*)(), void*);
+typedef int (*main_t)            (int argc, char** argv, char** envp);
+
+extern int main_wrapped;
+extern gotcha_wrappee_handle_t gotcha_internal_libc_main_wrappee_handle;
+extern gotcha_wrappee_handle_t gotcha_internal_main_wrappee_handle;
+
+int gotcha_internal_main(int argc, char** argv, char** envp);
+int gotcha_internal_libc_start_main (int (*)(int, char**, char**), int, char**, void (*)(), void (*)(), void (*)(), void*);
+
+extern struct gotcha_binding_t libc_main_wrappers[];
+extern struct gotcha_binding_t main_wrappers[];
+
+#endif
diff --git a/ext/hwloc/Makefile b/ext/hwloc/Makefile
index 003d1607f..a980ca698 100644
--- a/ext/hwloc/Makefile
+++ b/ext/hwloc/Makefile
@@ -14,7 +14,7 @@ INCLUDES  += -I./include
 LIBS      = -L. -lm
 LFLAGS    = -fPIC -fvisibility=hidden
 Q         ?= @
-DEFINES := $(filter-out -DVERSION=$(VERSION),$(DEFINES))
+DEFINES := $(filter-out -DVERSION=$(VERSION),$(DEFINES)) -DRUNSTATEDIR=\"/var/run\"
 ifeq ($(DEBUG),true)
 DEBUG_FLAGS = -g
 else
@@ -25,6 +25,7 @@ CFLAGS += -mmic
 LFLAGS += -mmic
 endif
 
+
 #CONFIGURE BUILD SYSTEM
 BUILD_DIR  = ./$(COMPILER)
 
@@ -32,12 +33,18 @@ VPATH     = $(SRC_DIRS)
 FILES     = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
 OBJ       = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
 LIBHWLOC = $(shell basename $(TARGET_HWLOC_LIB))
-ifeq ($(COMPILER), GCCARMv7)
+ifeq ($(strip $(COMPILER)), GCCARMv7)
 OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o, $(OBJ))
 endif
-ifeq ($(COMPILER), GCCARMv8)
+ifeq ($(strip $(COMPILER)), GCCARMv8)
 OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o, $(OBJ))
 endif
+ifeq ($(strip $(COMPILER)),GCCPOWER) # strip as in the ARM checks above; drop topology-x86.o from the object list
+OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o,$(OBJ))
+endif
+ifeq ($(strip $(COMPILER)),XLC) # same exclusion for the XLC compiler setting
+OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o,$(OBJ))
+endif
 
 
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
diff --git a/ext/hwloc/hwloc/base64.c b/ext/hwloc/hwloc/base64.c
index 7a3392fab..4df67bf97 100644
--- a/ext/hwloc/hwloc/base64.c
+++ b/ext/hwloc/hwloc/base64.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012 Inria.  All rights reserved.
+ * Copyright © 2012-2018 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  *
  * Modifications after import:
@@ -8,6 +8,11 @@
  * - updated #include
  */
 
+/* include hwloc's config before anything else
+ * so that extensions and features are properly enabled
+ */
+#include "private/private.h"
+
 /*	$OpenBSD: base64.c,v 1.5 2006/10/21 09:55:03 otto Exp $	*/
 
 /*
@@ -125,8 +130,6 @@ static const char Pad64 = '=';
 #include <string.h>
 #include <ctype.h>
 
-#include <private/private.h>
-
 int
 hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize)
 {
@@ -178,7 +181,7 @@ hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t t
 	if (datalength >= targsize)
 		return (-1);
 	target[datalength] = '\0';	/* Returned value doesn't count \0. */
-	return (datalength);
+	return (int)(datalength);
 }
 
 /* skips all whitespace anywhere.
@@ -213,7 +216,7 @@ hwloc_decode_from_base64(char const *src, char *target, size_t targsize)
 			if (target) {
 				if (tarindex >= targsize)
 					return (-1);
-				target[tarindex] = (pos - Base64) << 2;
+				target[tarindex] = (char)(pos - Base64) << 2;
 			}
 			state = 1;
 			break;
diff --git a/ext/hwloc/hwloc/bind.c b/ext/hwloc/hwloc/bind.c
index e2b5a063e..0bd85e258 100644
--- a/ext/hwloc/hwloc/bind.c
+++ b/ext/hwloc/hwloc/bind.c
@@ -1,15 +1,16 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2011 inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2010, 2012 Université Bordeaux
- * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2011-2015 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <hwloc/helper.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "hwloc/helper.h"
+
 #ifdef HAVE_SYS_MMAN_H
 #  include <sys/mman.h>
 #endif
@@ -23,27 +24,21 @@
 #include <stdlib.h>
 #include <errno.h>
 
-/* TODO: HWLOC_GNU_SYS, HWLOC_IRIX_SYS,
- *
- * IRIX: see MP_MUSTRUN / _DSM_MUSTRUN, pthread_setrunon_np, /hw, procss_cpulink, numa_create
+/* TODO: HWLOC_GNU_SYS,
  *
  * We could use glibc's sched_setaffinity generically when it is available
  *
  * Darwin and OpenBSD don't seem to have binding facilities.
  */
 
+#define HWLOC_CPUBIND_ALLFLAGS (HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT|HWLOC_CPUBIND_NOMEMBIND)
+
 static hwloc_const_bitmap_t
 hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
 {
   hwloc_const_bitmap_t topology_set = hwloc_topology_get_topology_cpuset(topology);
   hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
 
-  if (!topology_set) {
-    /* The topology is composed of several systems, the cpuset is ambiguous. */
-    errno = EXDEV;
-    return NULL;
-  }
-
   if (hwloc_bitmap_iszero(set)) {
     errno = EINVAL;
     return NULL;
@@ -63,6 +58,11 @@ hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
 int
 hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   set = hwloc_fix_cpubind(topology, set);
   if (!set)
     return -1;
@@ -74,9 +74,13 @@ hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags
     if (topology->binding_hooks.set_thisthread_cpubind)
       return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
   } else {
-    if (topology->binding_hooks.set_thisproc_cpubind)
-      return topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
-    else if (topology->binding_hooks.set_thisthread_cpubind)
+    if (topology->binding_hooks.set_thisproc_cpubind) {
+      int err = topology->binding_hooks.set_thisproc_cpubind(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.set_thisthread_cpubind)
       return topology->binding_hooks.set_thisthread_cpubind(topology, set, flags);
   }
 
@@ -87,6 +91,11 @@ hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags
 int
 hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (flags & HWLOC_CPUBIND_PROCESS) {
     if (topology->binding_hooks.get_thisproc_cpubind)
       return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
@@ -94,9 +103,13 @@ hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
     if (topology->binding_hooks.get_thisthread_cpubind)
       return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
   } else {
-    if (topology->binding_hooks.get_thisproc_cpubind)
-      return topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
-    else if (topology->binding_hooks.get_thisthread_cpubind)
+    if (topology->binding_hooks.get_thisproc_cpubind) {
+      int err = topology->binding_hooks.get_thisproc_cpubind(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_cpubind)
       return topology->binding_hooks.get_thisthread_cpubind(topology, set, flags);
   }
 
@@ -107,6 +120,11 @@ hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
 int
 hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   set = hwloc_fix_cpubind(topology, set);
   if (!set)
     return -1;
@@ -121,6 +139,11 @@ hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_b
 int
 hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (topology->binding_hooks.get_proc_cpubind)
     return topology->binding_hooks.get_proc_cpubind(topology, pid, set, flags);
 
@@ -132,6 +155,11 @@ hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_
 int
 hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   set = hwloc_fix_cpubind(topology, set);
   if (!set)
     return -1;
@@ -146,6 +174,11 @@ hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_co
 int
 hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (topology->binding_hooks.get_thread_cpubind)
     return topology->binding_hooks.get_thread_cpubind(topology, tid, set, flags);
 
@@ -157,6 +190,11 @@ hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bi
 int
 hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (flags & HWLOC_CPUBIND_PROCESS) {
     if (topology->binding_hooks.get_thisproc_last_cpu_location)
       return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
@@ -164,9 +202,13 @@ hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int f
     if (topology->binding_hooks.get_thisthread_last_cpu_location)
       return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
   } else {
-    if (topology->binding_hooks.get_thisproc_last_cpu_location)
-      return topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
-    else if (topology->binding_hooks.get_thisthread_last_cpu_location)
+    if (topology->binding_hooks.get_thisproc_last_cpu_location) {
+      int err = topology->binding_hooks.get_thisproc_last_cpu_location(topology, set, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_last_cpu_location)
       return topology->binding_hooks.get_thisthread_last_cpu_location(topology, set, flags);
   }
 
@@ -177,6 +219,11 @@ hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int f
 int
 hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
 {
+  if (flags & ~HWLOC_CPUBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (topology->binding_hooks.get_proc_last_cpu_location)
     return topology->binding_hooks.get_proc_last_cpu_location(topology, pid, set, flags);
 
@@ -184,25 +231,14 @@ hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwl
   return -1;
 }
 
+#define HWLOC_MEMBIND_ALLFLAGS (HWLOC_MEMBIND_PROCESS|HWLOC_MEMBIND_THREAD|HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_MIGRATE|HWLOC_MEMBIND_NOCPUBIND|HWLOC_MEMBIND_BYNODESET)
+
 static hwloc_const_nodeset_t
 hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
 {
   hwloc_const_bitmap_t topology_nodeset = hwloc_topology_get_topology_nodeset(topology);
   hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
 
-  if (!hwloc_topology_get_topology_cpuset(topology)) {
-    /* The topology is composed of several systems, the nodeset is thus
-     * ambiguous. */
-    errno = EXDEV;
-    return NULL;
-  }
-
-  if (!complete_nodeset) {
-    /* There is no NUMA node */
-    errno = ENODEV;
-    return NULL;
-  }
-
   if (hwloc_bitmap_iszero(nodeset)) {
     errno = EINVAL;
     return NULL;
@@ -226,19 +262,6 @@ hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwl
   hwloc_const_bitmap_t complete_set = hwloc_topology_get_complete_cpuset(topology);
   hwloc_const_bitmap_t complete_nodeset = hwloc_topology_get_complete_nodeset(topology);
 
-  if (!topology_set) {
-    /* The topology is composed of several systems, the cpuset is thus
-     * ambiguous. */
-    errno = EXDEV;
-    return -1;
-  }
-
-  if (!complete_nodeset) {
-    /* There is no NUMA node */
-    errno = ENODEV;
-    return -1;
-  }
-
   if (hwloc_bitmap_iszero(cpuset)) {
     errno = EINVAL;
     return -1;
@@ -258,9 +281,25 @@ hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwl
   return 0;
 }
 
-int
-hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+static __hwloc_inline int hwloc__check_membind_policy(hwloc_membind_policy_t policy)
+{
+  if (policy == HWLOC_MEMBIND_DEFAULT
+      || policy == HWLOC_MEMBIND_FIRSTTOUCH
+      || policy == HWLOC_MEMBIND_BIND
+      || policy == HWLOC_MEMBIND_INTERLEAVE
+      || policy == HWLOC_MEMBIND_NEXTTOUCH)
+    return 0;
+  return -1;
+}
+
+static int
+hwloc_set_membind_by_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
   nodeset = hwloc_fix_membind(topology, nodeset);
   if (!nodeset)
     return -1;
@@ -272,9 +311,13 @@ hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodes
     if (topology->binding_hooks.set_thisthread_membind)
       return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
   } else {
-    if (topology->binding_hooks.set_thisproc_membind)
-      return topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
-    else if (topology->binding_hooks.set_thisthread_membind)
+    if (topology->binding_hooks.set_thisproc_membind) {
+      int err = topology->binding_hooks.set_thisproc_membind(topology, nodeset, policy, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.set_thisthread_membind)
       return topology->binding_hooks.set_thisthread_membind(topology, nodeset, policy, flags);
   }
 
@@ -283,23 +326,31 @@ hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodes
 }
 
 int
-hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
 {
-  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
   int ret;
 
-  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
-    ret = -1;
-  else
-    ret = hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
-
-  hwloc_bitmap_free(nodeset);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_membind_by_nodeset(topology, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_membind_by_nodeset(topology, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
   return ret;
 }
 
-int
-hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+static int
+hwloc_get_membind_by_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
 {
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (flags & HWLOC_MEMBIND_PROCESS) {
     if (topology->binding_hooks.get_thisproc_membind)
       return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
@@ -307,9 +358,13 @@ hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hw
     if (topology->binding_hooks.get_thisthread_membind)
       return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
   } else {
-    if (topology->binding_hooks.get_thisproc_membind)
-      return topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
-    else if (topology->binding_hooks.get_thisthread_membind)
+    if (topology->binding_hooks.get_thisproc_membind) {
+      int err = topology->binding_hooks.get_thisproc_membind(topology, nodeset, policy, flags);
+      if (err >= 0 || errno != ENOSYS)
+        return err;
+      /* ENOSYS, fallback */
+    }
+    if (topology->binding_hooks.get_thisthread_membind)
       return topology->binding_hooks.get_thisthread_membind(topology, nodeset, policy, flags);
   }
 
@@ -318,24 +373,31 @@ hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hw
 }
 
 int
-hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
 {
-  hwloc_nodeset_t nodeset;
   int ret;
 
-  nodeset = hwloc_bitmap_alloc();
-  ret = hwloc_get_membind_nodeset(topology, nodeset, policy, flags);
-
-  if (!ret)
-    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_membind_by_nodeset(topology, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_membind_by_nodeset(topology, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
-int
-hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+static int
+hwloc_set_proc_membind_by_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
   nodeset = hwloc_fix_membind(topology, nodeset);
   if (!nodeset)
     return -1;
@@ -349,23 +411,32 @@ hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc
 
 
 int
-hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
 {
-  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
   int ret;
 
-  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
-    ret = -1;
-  else
-    ret = hwloc_set_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_proc_membind_by_nodeset(topology, pid, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_proc_membind_by_nodeset(topology, pid, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
-int
-hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+static int
+hwloc_get_proc_membind_by_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
 {
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
   if (topology->binding_hooks.get_proc_membind)
     return topology->binding_hooks.get_proc_membind(topology, pid, nodeset, policy, flags);
 
@@ -374,24 +445,35 @@ hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc
 }
 
 int
-hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
 {
-  hwloc_nodeset_t nodeset;
   int ret;
 
-  nodeset = hwloc_bitmap_alloc();
-  ret = hwloc_get_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
-
-  if (!ret)
-    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_proc_membind_by_nodeset(topology, pid, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_proc_membind_by_nodeset(topology, pid, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
-int
-hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+static int
+hwloc_set_area_membind_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len)
+    /* nothing to do */
+    return 0;
+
   nodeset = hwloc_fix_membind(topology, nodeset);
   if (!nodeset)
     return -1;
@@ -404,23 +486,38 @@ hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size
 }
 
 int
-hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
 {
-  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
   int ret;
 
-  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
-    ret = -1;
-  else
-    ret = hwloc_set_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_set_area_membind_by_nodeset(topology, addr, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+      ret = -1;
+    else
+      ret = hwloc_set_area_membind_by_nodeset(topology, addr, len, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
-int
-hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+static int
+hwloc_get_area_membind_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
 {
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len) {
+    /* nothing to query */
+    errno = EINVAL;
+    return -1;
+  }
+
   if (topology->binding_hooks.get_area_membind)
     return topology->binding_hooks.get_area_membind(topology, addr, len, nodeset, policy, flags);
 
@@ -429,25 +526,64 @@ hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size
 }
 
 int
-hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags)
 {
-  hwloc_nodeset_t nodeset;
   int ret;
 
-  nodeset = hwloc_bitmap_alloc();
-  ret = hwloc_get_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_area_membind_by_nodeset(topology, addr, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_area_membind_by_nodeset(topology, addr, len, nodeset, policy, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
+
+  return ret;
+}
+
+static int
+hwloc_get_area_memlocation_by_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags)
+{
+  if (flags & ~HWLOC_MEMBIND_ALLFLAGS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!len)
+    /* nothing to do */
+    return 0;
+
+  if (topology->binding_hooks.get_area_memlocation)
+    return topology->binding_hooks.get_area_memlocation(topology, addr, len, nodeset, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+int
+hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, int flags)
+{
+  int ret;
 
-  if (!ret)
-    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_get_area_memlocation_by_nodeset(topology, addr, len, set, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    ret = hwloc_get_area_memlocation_by_nodeset(topology, addr, len, nodeset, flags);
+    if (!ret)
+      hwloc_cpuset_from_nodeset(topology, set, nodeset);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
 void *
 hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
 {
-  void *p;
+  void *p = NULL;
 #if defined(hwloc_getpagesize) && defined(HAVE_POSIX_MEMALIGN)
   errno = posix_memalign(&p, hwloc_getpagesize(), len);
   if (errno)
@@ -464,7 +600,8 @@ hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
 void *
 hwloc_alloc_mmap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
 {
-  return mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  void * buffer = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  return buffer == MAP_FAILED ? NULL : buffer;
 }
 #endif
 
@@ -493,10 +630,16 @@ hwloc_alloc(hwloc_topology_t topology, size_t len)
   return hwloc_alloc_heap(topology, len);
 }
 
-void *
-hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+static void *
+hwloc_alloc_membind_by_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
   void *p;
+
+  if ((flags & ~HWLOC_MEMBIND_ALLFLAGS) || hwloc__check_membind_policy(policy) < 0) {
+    errno = EINVAL;
+    return NULL;
+  }
+
   nodeset = hwloc_fix_membind(topology, nodeset);
   if (!nodeset)
     goto fallback;
@@ -531,20 +674,24 @@ hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_n
 }
 
 void *
-hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags)
 {
-  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
   void *ret;
 
-  if (hwloc_fix_membind_cpuset(topology, nodeset, set)) {
-    if (flags & HWLOC_MEMBIND_STRICT)
-      ret = NULL;
-    else
-      ret = hwloc_alloc(topology, len);
-  } else
-    ret = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+  if (flags & HWLOC_MEMBIND_BYNODESET) {
+    ret = hwloc_alloc_membind_by_nodeset(topology, len, set, policy, flags);
+  } else {
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    if (hwloc_fix_membind_cpuset(topology, nodeset, set)) {
+      if (flags & HWLOC_MEMBIND_STRICT)
+	ret = NULL;
+      else
+	ret = hwloc_alloc(topology, len);
+    } else
+      ret = hwloc_alloc_membind_by_nodeset(topology, len, nodeset, policy, flags);
+    hwloc_bitmap_free(nodeset);
+  }
 
-  hwloc_bitmap_free(nodeset);
   return ret;
 }
 
@@ -562,12 +709,8 @@ hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
 
 static int dontset_return_complete_cpuset(hwloc_topology_t topology, hwloc_cpuset_t set)
 {
-  hwloc_const_cpuset_t cpuset = hwloc_topology_get_complete_cpuset(topology);
-  if (cpuset) {
-    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-    return 0;
-  } else
-    return -1;
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  return 0;
 }
 
 static int dontset_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
@@ -607,13 +750,9 @@ static int dontget_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_un
 
 static int dontset_return_complete_nodeset(hwloc_topology_t topology, hwloc_nodeset_t set, hwloc_membind_policy_t *policy)
 {
-  hwloc_const_nodeset_t nodeset = hwloc_topology_get_complete_nodeset(topology);
-  if (nodeset) {
-    hwloc_bitmap_copy(set, hwloc_topology_get_complete_nodeset(topology));
-    *policy = HWLOC_MEMBIND_DEFAULT;
-    return 0;
-  } else
-    return -1;
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_nodeset(topology));
+  *policy = HWLOC_MEMBIND_MIXED;
+  return 0;
 }
 
 static int dontset_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
@@ -651,6 +790,11 @@ static int dontget_area_membind(hwloc_topology_t topology __hwloc_attribute_unus
 {
   return dontset_return_complete_nodeset(topology, set, policy);
 }
+static int dontget_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  hwloc_membind_policy_t policy;
+  return dontset_return_complete_nodeset(topology, set, &policy);
+}
 
 static void * dontalloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
 {
@@ -687,6 +831,7 @@ static void hwloc_set_dummy_hooks(struct hwloc_binding_hooks *hooks,
   hooks->get_proc_membind = dontget_proc_membind;
   hooks->set_area_membind = dontset_area_membind;
   hooks->get_area_membind = dontget_area_membind;
+  hooks->get_area_memlocation = dontget_area_memlocation;
   hooks->alloc_membind = dontalloc_membind;
   hooks->free_membind = dontfree_membind;
 }
@@ -706,10 +851,6 @@ hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_t
     hwloc_set_aix_hooks(hooks, support);
 #    endif /* HWLOC_AIX_SYS */
 
-#    ifdef HWLOC_OSF_SYS
-    hwloc_set_osf_hooks(hooks, support);
-#    endif /* HWLOC_OSF_SYS */
-
 #    ifdef HWLOC_SOLARIS_SYS
     hwloc_set_solaris_hooks(hooks, support);
 #    endif /* HWLOC_SOLARIS_SYS */
@@ -745,6 +886,8 @@ hwloc_set_binding_hooks(struct hwloc_topology *topology)
   } else {
     /* not this system, use dummy binding hooks that do nothing (but don't return ENOSYS) */
     hwloc_set_dummy_hooks(&topology->binding_hooks, &topology->support);
+
+    /* Linux has some hooks that also work in this case, but they are not strictly needed yet. */
   }
 
   /* if not is_thissystem, set_cpubind is fake
@@ -776,6 +919,7 @@ hwloc_set_binding_hooks(struct hwloc_topology *topology)
     DO(mem,get_proc_membind);
     DO(mem,set_area_membind);
     DO(mem,get_area_membind);
+    DO(mem,get_area_memlocation);
     DO(mem,alloc_membind);
   }
 }
diff --git a/ext/hwloc/hwloc/bitmap.c b/ext/hwloc/hwloc/bitmap.c
index e2b807a61..5fb9cd356 100644
--- a/ext/hwloc/hwloc/bitmap.c
+++ b/ext/hwloc/hwloc/bitmap.c
@@ -1,17 +1,18 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2011 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc.h>
-#include <private/misc.h>
-#include <private/private.h>
-#include <hwloc/bitmap.h>
+#include "private/autogen/config.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc.h"
+#include "private/misc.h"
+#include "private/private.h"
+#include "private/debug.h"
+#include "hwloc/bitmap.h"
 
 #include <stdarg.h>
 #include <stdio.h>
@@ -24,6 +25,8 @@
  * - have a way to change the initial allocation size:
  *   add hwloc_bitmap_set_foo() to changes a global here,
  *   and make the hwloc core call based on the early number of PUs
+ * - make HWLOC_BITMAP_PREALLOC_BITS configurable, and detectable
+ *   by parsing /proc/cpuinfo during configure on Linux.
  * - preallocate inside the bitmap structure (so that the whole structure is a cacheline for instance)
  *   and allocate a dedicated array only later when reallocating larger
  * - add a bitmap->ulongs_empty_first which guarantees that some first ulongs are empty,
@@ -35,6 +38,10 @@
 /* magic number */
 #define HWLOC_BITMAP_MAGIC 0x20091007
 
+/* preallocated bits in every bitmap */
+#define HWLOC_BITMAP_PREALLOC_BITS 512
+#define HWLOC_BITMAP_PREALLOC_ULONGS (HWLOC_BITMAP_PREALLOC_BITS/HWLOC_BITS_PER_LONG)
+
 /* actual opaque type internals */
 struct hwloc_bitmap_s {
   unsigned ulongs_count; /* how many ulong bitmasks are valid, >= 1 */
@@ -83,8 +90,8 @@ struct hwloc_bitmap_s * hwloc_bitmap_alloc(void)
     return NULL;
 
   set->ulongs_count = 1;
-  set->ulongs_allocated = 64/sizeof(unsigned long);
-  set->ulongs = malloc(64);
+  set->ulongs_allocated = HWLOC_BITMAP_PREALLOC_ULONGS;
+  set->ulongs = malloc(HWLOC_BITMAP_PREALLOC_ULONGS * sizeof(unsigned long));
   if (!set->ulongs) {
     free(set);
     return NULL;
@@ -124,21 +131,29 @@ void hwloc_bitmap_free(struct hwloc_bitmap_s * set)
 
 /* enlarge until it contains at least needed_count ulongs.
  */
-static void
+static int
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
 hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
 {
-  unsigned tmp = 1 << hwloc_flsl((unsigned long) needed_count - 1);
+  unsigned tmp = 1U << hwloc_flsl((unsigned long) needed_count - 1);
   if (tmp > set->ulongs_allocated) {
-    set->ulongs = realloc(set->ulongs, tmp * sizeof(unsigned long));
-    assert(set->ulongs);
+    unsigned long *tmpulongs;
+    tmpulongs = realloc(set->ulongs, tmp * sizeof(unsigned long));
+    if (!tmpulongs)
+      return -1;
+    set->ulongs = tmpulongs;
     set->ulongs_allocated = tmp;
   }
+  return 0;
 }
 
 /* enlarge until it contains at least needed_count ulongs,
  * and update new ulongs according to the infinite field.
  */
-static void
+static int
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
 hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
 {
   unsigned i;
@@ -146,15 +161,17 @@ hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_coun
   HWLOC__BITMAP_CHECK(set);
 
   if (needed_count <= set->ulongs_count)
-    return;
+    return 0;
 
   /* realloc larger if needed */
-  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+  if (hwloc_bitmap_enlarge_by_ulongs(set, needed_count) < 0)
+    return -1;
 
   /* fill the newly allocated subset depending on the infinite flag */
   for(i=set->ulongs_count; i<needed_count; i++)
     set->ulongs[i] = set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
   set->ulongs_count = needed_count;
+  return 0;
 }
 
 /* realloc until it contains at least cpu+1 bits */
@@ -163,11 +180,15 @@ hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_coun
 /* reset a bitmap to exactely the needed size.
  * the caller must reinitialize all ulongs and the infinite flag later.
  */
-static void
+static int
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count) __hwloc_attribute_warn_unused_result;
+static int
 hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
 {
-  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+  if (hwloc_bitmap_enlarge_by_ulongs(set, needed_count))
+    return -1;
   set->ulongs_count = needed_count;
+  return 0;
 }
 
 /* reset until it contains exactly cpu+1 bits (roundup to a ulong).
@@ -175,7 +196,7 @@ hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
  */
 #define hwloc_bitmap_reset_by_cpu_index(set, cpu) hwloc_bitmap_reset_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
 
-struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+struct hwloc_bitmap_s * hwloc_bitmap_tma_dup(struct hwloc_tma *tma, const struct hwloc_bitmap_s * old)
 {
   struct hwloc_bitmap_s * new;
 
@@ -184,11 +205,11 @@ struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
 
   HWLOC__BITMAP_CHECK(old);
 
-  new = malloc(sizeof(struct hwloc_bitmap_s));
+  new = hwloc_tma_malloc(tma, sizeof(struct hwloc_bitmap_s));
   if (!new)
     return NULL;
 
-  new->ulongs = malloc(old->ulongs_allocated * sizeof(unsigned long));
+  new->ulongs = hwloc_tma_malloc(tma, old->ulongs_allocated * sizeof(unsigned long));
   if (!new->ulongs) {
     free(new);
     return NULL;
@@ -203,15 +224,22 @@ struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
   return new;
 }
 
-void hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
+struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+{
+  return hwloc_bitmap_tma_dup(NULL, old);
+}
+
+int hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
 {
   HWLOC__BITMAP_CHECK(dst);
   HWLOC__BITMAP_CHECK(src);
 
-  hwloc_bitmap_reset_by_ulongs(dst, src->ulongs_count);
+  if (hwloc_bitmap_reset_by_ulongs(dst, src->ulongs_count) < 0)
+    return -1;
 
   memcpy(dst->ulongs, src->ulongs, src->ulongs_count * sizeof(unsigned long));
   dst->infinite = src->infinite;
+  return 0;
 }
 
 /* Strings always use 32bit groups */
@@ -248,12 +276,12 @@ int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const stru
       return -1;
     ret += res;
     if (res >= size)
-      res = size>0 ? size - 1 : 0;
+      res = size>0 ? (int)size - 1 : 0;
     tmp += res;
     size -= res;
   }
 
-  i=set->ulongs_count-1;
+  i=(int) set->ulongs_count-1;
 
   if (set->infinite) {
     /* ignore starting FULL since we have 0xf...f already */
@@ -298,7 +326,7 @@ int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const stru
 #endif
 
     if (res >= size)
-      res = size>0 ? size - 1 : 0;
+      res = size>0 ? (int)size - 1 : 0;
 
     tmp += res;
     size -= res;
@@ -324,6 +352,8 @@ int hwloc_bitmap_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_re
 
   len = hwloc_bitmap_snprintf(NULL, 0, set);
   buf = malloc(len+1);
+  if (!buf)
+    return -1;
   *strp = buf;
   return hwloc_bitmap_snprintf(buf, len+1, set);
 }
@@ -353,7 +383,8 @@ int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restric
     count--;
   }
 
-  hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG);
+  if (hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG) < 0)
+    return -1;
   set->infinite = 0;
 
   while (*current != '\0') {
@@ -392,7 +423,6 @@ int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restric
 int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
 {
   int prev = -1;
-  hwloc_bitmap_t reverse;
   ssize_t size = buflen;
   char *tmp = buf;
   int res, ret = 0;
@@ -400,9 +430,6 @@ int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const
 
   HWLOC__BITMAP_CHECK(set);
 
-  reverse = hwloc_bitmap_alloc(); /* FIXME: add hwloc_bitmap_alloc_size() + hwloc_bitmap_init_allocated() to avoid malloc? */
-  hwloc_bitmap_not(reverse, set);
-
   /* mark the end in case we do nothing later */
   if (buflen > 0)
     tmp[0] = '\0';
@@ -413,7 +440,7 @@ int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const
     begin = hwloc_bitmap_next(set, prev);
     if (begin == -1)
       break;
-    end = hwloc_bitmap_next(reverse, begin);
+    end = hwloc_bitmap_next_unset(set, begin);
 
     if (end == begin+1) {
       res = hwloc_snprintf(tmp, size, needcomma ? ",%d" : "%d", begin);
@@ -422,14 +449,12 @@ int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const
     } else {
       res = hwloc_snprintf(tmp, size, needcomma ? ",%d-%d" : "%d-%d", begin, end-1);
     }
-    if (res < 0) {
-      hwloc_bitmap_free(reverse);
+    if (res < 0)
       return -1;
-    }
     ret += res;
 
     if (res >= size)
-      res = size>0 ? size - 1 : 0;
+      res = size>0 ? (int)size - 1 : 0;
 
     tmp += res;
     size -= res;
@@ -441,8 +466,6 @@ int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const
       prev = end - 1;
   }
 
-  hwloc_bitmap_free(reverse);
-
   return ret;
 }
 
@@ -455,6 +478,8 @@ int hwloc_bitmap_list_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwl
 
   len = hwloc_bitmap_list_snprintf(NULL, 0, set);
   buf = malloc(len+1);
+  if (!buf)
+    return -1;
   *strp = buf;
   return hwloc_bitmap_list_snprintf(buf, len+1, set);
 }
@@ -470,7 +495,7 @@ int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_re
   while (*current != '\0') {
 
     /* ignore empty ranges */
-    while (*current == ',')
+    while (*current == ',' || *current == ' ')
       current++;
 
     val = strtoul(current, &next, 0);
@@ -494,7 +519,7 @@ int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_re
 	begin = val;
       }
 
-    } else if (*next == ',' || *next == '\0') {
+    } else if (*next == ',' || *next == ' ' || *next == '\0') {
       /* single digit */
       hwloc_bitmap_set(set, val);
     }
@@ -533,7 +558,7 @@ int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, co
       return -1;
     ret += res;
     if (res >= size)
-      res = size>0 ? size - 1 : 0;
+      res = size>0 ? (int)size - 1 : 0;
     tmp += res;
     size -= res;
   }
@@ -569,7 +594,7 @@ int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, co
       return -1;
     ret += res;
     if (res >= size)
-      res = size>0 ? size - 1 : 0;
+      res = size>0 ? (int)size - 1 : 0;
     tmp += res;
     size -= res;
   }
@@ -594,6 +619,8 @@ int hwloc_bitmap_taskset_asprintf(char ** strp, const struct hwloc_bitmap_s * __
 
   len = hwloc_bitmap_taskset_snprintf(NULL, 0, set);
   buf = malloc(len+1);
+  if (!buf)
+    return -1;
   *strp = buf;
   return hwloc_bitmap_taskset_snprintf(buf, len+1, set);
 }
@@ -605,7 +632,6 @@ int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc
   int count;
   int infinite = 0;
 
-  current = string;
   if (!strncmp("0xf...f", current, 7)) {
     /* infinite bitmap */
     infinite = 1;
@@ -627,10 +653,11 @@ int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc
   }
   /* we know there are other characters now */
 
-  chars = strlen(current);
+  chars = (int)strlen(current);
   count = (chars * 4 + HWLOC_BITS_PER_LONG - 1) / HWLOC_BITS_PER_LONG;
 
-  hwloc_bitmap_reset_by_ulongs(set, count);
+  if (hwloc_bitmap_reset_by_ulongs(set, count) < 0)
+    return -1;
   set->infinite = 0;
 
   while (*current != '\0') {
@@ -678,7 +705,12 @@ void hwloc_bitmap_zero(struct hwloc_bitmap_s * set)
 {
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_ulongs(set, 1);
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	if (hwloc_bitmap_reset_by_ulongs(set, 1) < 0) {
+		/* cannot fail since we preallocate some ulongs.
+		 * if we ever preallocate nothing, we'll reset to 0 ulongs.
+		 */
+	}
 	hwloc_bitmap__zero(set);
 }
 
@@ -694,30 +726,59 @@ void hwloc_bitmap_fill(struct hwloc_bitmap_s * set)
 {
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_ulongs(set, 1);
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	if (hwloc_bitmap_reset_by_ulongs(set, 1) < 0) {
+		/* cannot fail since we pre-allocate some ulongs.
+		 * if we ever pre-allocate nothing, we'll reset to 0 ulongs.
+		 */
+	}
 	hwloc_bitmap__fill(set);
 }
 
-void hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
+int hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
 {
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_ulongs(set, 1);
+	HWLOC_BUILD_ASSERT(HWLOC_BITMAP_PREALLOC_ULONGS >= 1);
+	if (hwloc_bitmap_reset_by_ulongs(set, 1) < 0) {
+		/* cannot fail since we pre-allocate some ulongs.
+		 * if we ever pre-allocate nothing, we may have to return a failure.
+		 */
+	}
 	set->ulongs[0] = mask; /* there's always at least one ulong allocated */
 	set->infinite = 0;
+	return 0; /* always reports success: the reset result above is deliberately ignored */
 }
 
-void hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+int hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
 {
 	unsigned j;
 
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_ulongs(set, i+1);
+	if (hwloc_bitmap_reset_by_ulongs(set, i+1) < 0)
+		return -1;
+
 	set->ulongs[i] = mask;
 	for(j=0; j<i; j++)
 		set->ulongs[j] = HWLOC_SUBBITMAP_ZERO;
 	set->infinite = 0;
+	return 0;
+}
+
+int hwloc_bitmap_from_ulongs(struct hwloc_bitmap_s *set, unsigned nr, const unsigned long *masks)
+{
+	unsigned j;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (hwloc_bitmap_reset_by_ulongs(set, nr) < 0) /* presumably makes room for nr ulongs in set - confirm in helper */
+		return -1; /* reset failed: report it before touching set->ulongs[] */
+
+	for(j=0; j<nr; j++) /* copy the caller's nr words verbatim, masks[j] into ulongs[j] */
+		set->ulongs[j] = masks[j];
+	set->infinite = 0; /* result is finite: the infinite part is marked as unset */
+	return 0;
 }
 
 unsigned long hwloc_bitmap_to_ulong(const struct hwloc_bitmap_s *set)
@@ -734,29 +795,59 @@ unsigned long hwloc_bitmap_to_ith_ulong(const struct hwloc_bitmap_s *set, unsign
 	return HWLOC_SUBBITMAP_READULONG(set, i);
 }
 
-void hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
+int hwloc_bitmap_to_ulongs(const struct hwloc_bitmap_s *set, unsigned nr, unsigned long *masks)
+{
+	unsigned j;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(j=0; j<nr; j++) /* always writes exactly nr entries of masks[], one per ulong index */
+		masks[j] = HWLOC_SUBBITMAP_READULONG(set, j); /* NOTE(review): macro presumably copes with j beyond the stored ulongs - confirm */
+	return 0; /* there is no failure path in this function */
+}
+
+int hwloc_bitmap_nr_ulongs(const struct hwloc_bitmap_s *set)
+{
+	int last; /* 0-based index of the last set bit, or -1 when no bit is set */
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (set->infinite) /* an infinitely-set bitmap cannot be described by a finite number of ulongs */
+		return -1;
+
+	last = hwloc_bitmap_last(set);
+	return last < 0 ? 0 : last / HWLOC_BITS_PER_LONG + 1; /* bit N lives in ulong N/HWLOC_BITS_PER_LONG, so count = index + 1; empty set needs 0 */
+}
+
+int hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
 {
 	unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
 
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	if (hwloc_bitmap_reset_by_cpu_index(set, cpu) < 0)
+		return -1;
+
 	hwloc_bitmap__zero(set);
 	set->ulongs[index_] |= HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
 }
 
-void hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
+int hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
 {
 	unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
 
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	if (hwloc_bitmap_reset_by_cpu_index(set, cpu) < 0)
+		return -1;
+
 	hwloc_bitmap__fill(set);
 	set->ulongs[index_] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
 }
 
-void hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
+int hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
 {
 	unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
 
@@ -764,13 +855,16 @@ void hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
 
 	/* nothing to do if setting inside the infinite part of the bitmap */
 	if (set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
-		return;
+		return 0;
+
+	if (hwloc_bitmap_realloc_by_cpu_index(set, cpu) < 0)
+		return -1;
 
-	hwloc_bitmap_realloc_by_cpu_index(set, cpu);
 	set->ulongs[index_] |= HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
 }
 
-void hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+int hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
 {
 	unsigned i;
 	unsigned beginset,endset;
@@ -778,43 +872,66 @@ void hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int
 
 	HWLOC__BITMAP_CHECK(set);
 
+	if (endcpu < begincpu)
+		return 0;
+	if (set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* setting only in the already-set infinite part, nothing to do */
+		return 0;
+
 	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make sure we can play with the ulong that contains begincpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, begincpu) < 0)
+			return -1;
+
+		/* update the ulong that contains begincpu */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		/* set ulongs after begincpu if any already allocated */
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+		/* mark the infinity as set */
 		set->infinite = 1;
-		/* keep endcpu == -1 since this unsigned is actually larger than anything else */
-	}
+	} else {
+		/* finite range */
 
-	if (set->infinite) {
-		/* truncate the range according to the infinite part of the bitmap */
-		if (endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* ignore the part of the range that overlaps with the already-set infinite part */
+		if (set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
 			endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
-		if (begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
-			return;
-	}
-	if (endcpu < begincpu)
-		return;
-	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+		/* make sure we can play with the ulongs that contain begincpu and endcpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, endcpu) < 0)
+			return -1;
 
-	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
-	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
-	for(i=beginset+1; i<endset; i++)
-		set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
-	if (beginset == endset) {
-		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
-	} else {
-		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
-		set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		/* update first and last ulongs */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+		if (beginset == endset) {
+			set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		} else {
+			set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+			set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		}
+		/* set ulongs in the middle of the range */
+		for(i=beginset+1; i<endset; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
 	}
+
+	return 0;
 }
 
-void hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+int hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
 {
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_realloc_by_ulongs(set, i+1);
+	if (hwloc_bitmap_realloc_by_ulongs(set, i+1) < 0)
+		return -1;
+
 	set->ulongs[i] = mask;
+	return 0;
 }
 
-void hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
+int hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
 {
 	unsigned index_ = HWLOC_SUBBITMAP_INDEX(cpu);
 
@@ -822,13 +939,16 @@ void hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
 
 	/* nothing to do if clearing inside the infinitely-unset part of the bitmap */
 	if (!set->infinite && cpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
-		return;
+		return 0;
+
+	if (hwloc_bitmap_realloc_by_cpu_index(set, cpu) < 0)
+		return -1;
 
-	hwloc_bitmap_realloc_by_cpu_index(set, cpu);
 	set->ulongs[index_] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	return 0;
 }
 
-void hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+int hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
 {
 	unsigned i;
 	unsigned beginset,endset;
@@ -836,32 +956,53 @@ void hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int
 
 	HWLOC__BITMAP_CHECK(set);
 
+	if (endcpu < begincpu)
+		return 0;
+
+	if (!set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* clearing only in the already-unset infinite part, nothing to do */
+		return 0;
+
 	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make sure we can play with the ulong that contains begincpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, begincpu) < 0)
+			return -1;
+
+		/* update the ulong that contains begincpu */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		/* clear ulong after begincpu if any already allocated */
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+		/* mark the infinity as unset */
 		set->infinite = 0;
-		/* keep endcpu == -1 since this unsigned is actually larger than anything else */
-	}
+	} else {
+		/* finite range */
 
-	if (!set->infinite) {
-		/* truncate the range according to the infinitely-unset part of the bitmap */
-		if (endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* ignore the part of the range that overlaps with the already-unset infinite part */
+		if (!set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
 			endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
-		if (begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
-			return;
-	}
-	if (endcpu < begincpu)
-		return;
-	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+		/* make sure we can play with the ulongs that contain begincpu and endcpu */
+		if (hwloc_bitmap_realloc_by_cpu_index(set, endcpu) < 0)
+			return -1;
 
-	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
-	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
-	for(i=beginset+1; i<endset; i++)
-		set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
-	if (beginset == endset) {
-		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
-	} else {
-		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
-		set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		/* update first and last ulongs */
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+		if (beginset == endset) {
+			set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		} else {
+			set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+			set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+		}
+		/* clear ulongs in the middle of the range */
+		for(i=beginset+1; i<endset; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
 	}
+
+	return 0;
 }
 
 int hwloc_bitmap_isset(const struct hwloc_bitmap_s * set, unsigned cpu)
@@ -998,7 +1139,7 @@ int hwloc_bitmap_isincluded (const struct hwloc_bitmap_s *sub_set, const struct
 	return 1;
 }
 
-void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+int hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
 {
 	/* cache counts so that we can reset res even if it's also set1 or set2 */
 	unsigned count1 = set1->ulongs_count;
@@ -1011,7 +1152,8 @@ void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *s
 	HWLOC__BITMAP_CHECK(set1);
 	HWLOC__BITMAP_CHECK(set2);
 
-	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
 
 	for(i=0; i<min_count; i++)
 		res->ulongs[i] = set1->ulongs[i] | set2->ulongs[i];
@@ -1035,9 +1177,10 @@ void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *s
 	}
 
 	res->infinite = set1->infinite || set2->infinite;
+	return 0;
 }
 
-void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+int hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
 {
 	/* cache counts so that we can reset res even if it's also set1 or set2 */
 	unsigned count1 = set1->ulongs_count;
@@ -1050,7 +1193,8 @@ void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *
 	HWLOC__BITMAP_CHECK(set1);
 	HWLOC__BITMAP_CHECK(set2);
 
-	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
 
 	for(i=0; i<min_count; i++)
 		res->ulongs[i] = set1->ulongs[i] & set2->ulongs[i];
@@ -1074,9 +1218,10 @@ void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *
 	}
 
 	res->infinite = set1->infinite && set2->infinite;
+	return 0;
 }
 
-void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+int hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
 {
 	/* cache counts so that we can reset res even if it's also set1 or set2 */
 	unsigned count1 = set1->ulongs_count;
@@ -1089,7 +1234,8 @@ void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_
 	HWLOC__BITMAP_CHECK(set1);
 	HWLOC__BITMAP_CHECK(set2);
 
-	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
 
 	for(i=0; i<min_count; i++)
 		res->ulongs[i] = set1->ulongs[i] & ~set2->ulongs[i];
@@ -1113,9 +1259,10 @@ void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_
 	}
 
 	res->infinite = set1->infinite && !set2->infinite;
+	return 0;
 }
 
-void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+int hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
 {
 	/* cache counts so that we can reset res even if it's also set1 or set2 */
 	unsigned count1 = set1->ulongs_count;
@@ -1128,7 +1275,8 @@ void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *
 	HWLOC__BITMAP_CHECK(set1);
 	HWLOC__BITMAP_CHECK(set2);
 
-	hwloc_bitmap_reset_by_ulongs(res, max_count);
+	if (hwloc_bitmap_reset_by_ulongs(res, max_count) < 0)
+		return -1;
 
 	for(i=0; i<min_count; i++)
 		res->ulongs[i] = set1->ulongs[i] ^ set2->ulongs[i];
@@ -1146,9 +1294,10 @@ void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *
 	}
 
 	res->infinite = (!set1->infinite) != (!set2->infinite);
+	return 0;
 }
 
-void hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
+int hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
 {
 	unsigned count = set->ulongs_count;
 	unsigned i;
@@ -1156,12 +1305,14 @@ void hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *
 	HWLOC__BITMAP_CHECK(res);
 	HWLOC__BITMAP_CHECK(set);
 
-	hwloc_bitmap_reset_by_ulongs(res, count);
+	if (hwloc_bitmap_reset_by_ulongs(res, count) < 0)
+		return -1;
 
 	for(i=0; i<count; i++)
 		res->ulongs[i] = ~set->ulongs[i];
 
 	res->infinite = !set->infinite;
+	return 0;
 }
 
 int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
@@ -1183,6 +1334,25 @@ int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
 	return -1;
 }
 
+int hwloc_bitmap_first_unset(const struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(i=0; i<set->ulongs_count; i++) {
+		/* invert the word so that unset bits become set bits, then use ffsl on that unsigned long */
+		unsigned long w = ~set->ulongs[i];
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	if (!set->infinite) /* every stored ulong is full: the first unset bit is the one right after them */
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+	return -1; /* every stored ulong is full and the infinite part is set: no unset bit at all */
+}
+
 int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
 {
 	int i;
@@ -1192,7 +1362,7 @@ int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
 	if (set->infinite)
 		return -1;
 
-	for(i=set->ulongs_count-1; i>=0; i--) {
+	for(i=(int)set->ulongs_count-1; i>=0; i--) {
 		/* subsets are unsigned longs, use flsl */
 		unsigned long w = set->ulongs[i];
 		if (w)
@@ -1202,6 +1372,25 @@ int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
 	return -1;
 }
 
+int hwloc_bitmap_last_unset(const struct hwloc_bitmap_s * set)
+{
+	int i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (!set->infinite) /* the infinite part is unset, so there is no "last" unset bit to report */
+		return -1;
+
+	for(i=(int)set->ulongs_count-1; i>=0; i--) { /* walk the stored ulongs from the highest index down */
+		/* invert the word so that unset bits become set bits, then use flsl on that unsigned long */
+		unsigned long w = ~set->ulongs[i];
+		if (w)
+			return hwloc_flsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	return -1; /* every stored ulong is full and the infinite part is set: no unset bit at all */
+}
+
 int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
 {
 	unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
@@ -1234,7 +1423,39 @@ int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
 	return -1;
 }
 
-void hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
+int hwloc_bitmap_next_unset(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+	unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (i >= set->ulongs_count) { /* the search starts beyond the stored ulongs, only the infinite part matters */
+		if (!set->infinite)
+			return prev_cpu + 1; /* infinite part is unset: the very next index is unset */
+		else
+			return -1; /* infinite part is set: nothing unset after prev_cpu */
+	}
+
+	for(; i<set->ulongs_count; i++) {
+		/* invert the word so that unset bits become set bits, then use ffsl on that unsigned long */
+		unsigned long w = ~set->ulongs[i];
+
+		/* if the prev cpu is in the same word as the possible next one,
+		   we need to mask out previous cpus */
+		if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == i)
+			w &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	if (!set->infinite) /* stored ulongs are full after prev_cpu: first unset bit is right after them */
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+	return -1; /* stored ulongs are full after prev_cpu and the infinite part is set */
+}
+
+int hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
 {
 	unsigned i;
 	int found = 0;
@@ -1263,9 +1484,11 @@ void hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
 			/* set the first non allocated bit */
 			unsigned first = set->ulongs_count * HWLOC_BITS_PER_LONG;
 			set->infinite = 0; /* do not let realloc fill the newly allocated sets */
-			hwloc_bitmap_set(set, first);
+			return hwloc_bitmap_set(set, first);
 		}
 	}
+
+	return 0;
 }
 
 int hwloc_bitmap_compare_first(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
@@ -1333,7 +1556,7 @@ int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_
 	if (count1 != count2) {
 		if (min_count < count2) {
 			unsigned long val1 = set1->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
-			for(i=max_count-1; i>=(signed) min_count; i--) {
+			for(i=(int)max_count-1; i>=(int) min_count; i--) {
 				unsigned long val2 = set2->ulongs[i];
 				if (val1 == val2)
 					continue;
@@ -1341,7 +1564,7 @@ int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_
 			}
 		} else {
 			unsigned long val2 = set2->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
-			for(i=max_count-1; i>=(signed) min_count; i--) {
+			for(i=(int)max_count-1; i>=(int) min_count; i--) {
 				unsigned long val1 = set1->ulongs[i];
 				if (val1 == val2)
 					continue;
@@ -1350,7 +1573,7 @@ int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_
 		}
 	}
 
-	for(i=min_count-1; i>=0; i--) {
+	for(i=(int)min_count-1; i>=0; i--) {
 		unsigned long val1 = set1->ulongs[i];
 		unsigned long val2 = set2->ulongs[i];
 		if (val1 == val2)
diff --git a/ext/hwloc/hwloc/components.c b/ext/hwloc/hwloc/components.c
index 7aa3b9da2..5c2879b64 100644
--- a/ext/hwloc/hwloc/components.c
+++ b/ext/hwloc/hwloc/components.c
@@ -1,17 +1,19 @@
 /*
- * Copyright © 2009-2015 Inria.  All rights reserved.
- * Copyright © 2012 Université Bordeau 1
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2012 Université Bordeaux
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/xml.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/xml.h"
+#include "private/misc.h"
 
 #define HWLOC_COMPONENT_STOP_NAME "stop"
 #define HWLOC_COMPONENT_EXCLUDE_CHAR '-'
 #define HWLOC_COMPONENT_SEPS ","
+#define HWLOC_COMPONENT_PHASESEP_CHAR ':'
 
 /* list of all registered discovery components, sorted by priority, higher priority first.
  * noos is last because its priority is 0.
@@ -24,6 +26,7 @@ static unsigned hwloc_components_users = 0; /* first one initializes, last ones
 static int hwloc_components_verbose = 0;
 #ifdef HWLOC_HAVE_PLUGINS
 static int hwloc_plugins_verbose = 0;
+static const char * hwloc_plugins_blacklist = NULL;
 #endif
 
 /* hwloc_components_mutex serializes:
@@ -76,7 +79,6 @@ hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
 {
   const char *basename;
   lt_dlhandle handle;
-  char *componentsymbolname = NULL;
   struct hwloc_component *component;
   struct hwloc__plugin_desc *desc, **prevdesc;
 
@@ -89,6 +91,12 @@ hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
   else
     basename++;
 
+  if (hwloc_plugins_blacklist && strstr(hwloc_plugins_blacklist, basename)) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin `%s' is blacklisted in the environment\n", basename);
+    goto out;
+  }
+
   /* dlopen and get the component structure */
   handle = lt_dlopenext(filename);
   if (!handle) {
@@ -96,7 +104,9 @@ hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
       fprintf(stderr, "Failed to load plugin: %s\n", lt_dlerror());
     goto out;
   }
-  componentsymbolname = malloc(strlen(basename)+10+1);
+
+{
+  char componentsymbolname[strlen(basename)+10+1];
   sprintf(componentsymbolname, "%s_component", basename);
   component = lt_dlsym(handle, componentsymbolname);
   if (!component) {
@@ -107,15 +117,14 @@ hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
   }
   if (component->abi != HWLOC_COMPONENT_ABI) {
     if (hwloc_plugins_verbose)
-      fprintf(stderr, "Plugin symbol ABI %u instead of %u\n",
+      fprintf(stderr, "Plugin symbol ABI %u instead of %d\n",
 	      component->abi, HWLOC_COMPONENT_ABI);
     goto out_with_handle;
   }
   if (hwloc_plugins_verbose)
     fprintf(stderr, "Plugin contains expected symbol `%s'\n",
 	    componentsymbolname);
-  free(componentsymbolname);
-  componentsymbolname = NULL;
+}
 
   if (HWLOC_COMPONENT_TYPE_DISC == component->type) {
     if (strncmp(basename, "hwloc_", 6)) {
@@ -159,7 +168,6 @@ hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
 
  out_with_handle:
   lt_dlclose(handle);
-  free(componentsymbolname); /* NULL if already freed */
  out:
   return 0;
 }
@@ -190,13 +198,15 @@ static int
 hwloc_plugins_init(void)
 {
   const char *verboseenv;
-  char *path = HWLOC_PLUGINS_PATH;
+  const char *path = HWLOC_PLUGINS_PATH;
   const char *env;
   int err;
 
   verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
   hwloc_plugins_verbose = verboseenv ? atoi(verboseenv) : 0;
 
+  hwloc_plugins_blacklist = getenv("HWLOC_PLUGINS_BLACKLIST");
+
   err = lt_dlinit();
   if (err)
     goto out;
@@ -223,17 +233,6 @@ hwloc_plugins_init(void)
 
 #endif /* HWLOC_HAVE_PLUGINS */
 
-static const char *
-hwloc_disc_component_type_string(hwloc_disc_component_type_t type)
-{
-  switch (type) {
-  case HWLOC_DISC_COMPONENT_TYPE_CPU: return "cpu";
-  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL: return "global";
-  case HWLOC_DISC_COMPONENT_TYPE_MISC: return "misc";
-  default: return "**unknown**";
-  }
-}
-
 static int
 hwloc_disc_component_register(struct hwloc_disc_component *component,
 			      const char *filename)
@@ -247,21 +246,26 @@ hwloc_disc_component_register(struct hwloc_disc_component *component,
     return -1;
   }
   if (strchr(component->name, HWLOC_COMPONENT_EXCLUDE_CHAR)
+      || strchr(component->name, HWLOC_COMPONENT_PHASESEP_CHAR)
       || strcspn(component->name, HWLOC_COMPONENT_SEPS) != strlen(component->name)) {
     if (hwloc_components_verbose)
       fprintf(stderr, "Cannot register discovery component with name `%s' containing reserved characters `%c" HWLOC_COMPONENT_SEPS "'\n",
 	      component->name, HWLOC_COMPONENT_EXCLUDE_CHAR);
     return -1;
   }
-  /* check that the component type is valid */
-  switch ((unsigned) component->type) {
-  case HWLOC_DISC_COMPONENT_TYPE_CPU:
-  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL:
-  case HWLOC_DISC_COMPONENT_TYPE_MISC:
-    break;
-  default:
-    fprintf(stderr, "Cannot register discovery component `%s' with unknown type %u\n",
-	    component->name, (unsigned) component->type);
+
+  /* check that the component phases are valid */
+  if (!component->phases
+      || (component->phases != HWLOC_DISC_PHASE_GLOBAL
+	  && component->phases & ~(HWLOC_DISC_PHASE_CPU
+				   |HWLOC_DISC_PHASE_MEMORY
+				   |HWLOC_DISC_PHASE_PCI
+				   |HWLOC_DISC_PHASE_IO
+				   |HWLOC_DISC_PHASE_MISC
+				   |HWLOC_DISC_PHASE_ANNOTATE
+				   |HWLOC_DISC_PHASE_TWEAK))) {
+    fprintf(stderr, "Cannot register discovery component `%s' with invalid phases 0x%x\n",
+	    component->name, component->phases);
     return -1;
   }
 
@@ -286,8 +290,8 @@ hwloc_disc_component_register(struct hwloc_disc_component *component,
     prev = &((*prev)->next);
   }
   if (hwloc_components_verbose)
-    fprintf(stderr, "Registered %s discovery component `%s' with priority %u (%s%s)\n",
-	    hwloc_disc_component_type_string(component->type), component->name, component->priority,
+    fprintf(stderr, "Registered discovery component `%s' phases 0x%x with priority %u (%s%s)\n",
+	    component->name, component->phases, component->priority,
 	    filename ? "from plugin " : "statically build", filename ? filename : "");
 
   prev = &hwloc_disc_components;
@@ -301,13 +305,13 @@ hwloc_disc_component_register(struct hwloc_disc_component *component,
   return 0;
 }
 
-#include <static-components.h>
+#include "static-components.h"
 
 static void (**hwloc_component_finalize_cbs)(unsigned long);
 static unsigned hwloc_component_finalize_cb_count;
 
 void
-hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
+hwloc_components_init(void)
 {
 #ifdef HWLOC_HAVE_PLUGINS
   struct hwloc__plugin_desc *desc;
@@ -319,7 +323,7 @@ hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
   assert((unsigned) -1 != hwloc_components_users);
   if (0 != hwloc_components_users++) {
     HWLOC_COMPONENTS_UNLOCK();
-    goto ok;
+    return;
   }
 
   verboseenv = getenv("HWLOC_COMPONENTS_VERBOSE");
@@ -367,8 +371,8 @@ hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
     /* register for real now */
     if (HWLOC_COMPONENT_TYPE_DISC == hwloc_static_components[i]->type)
       hwloc_disc_component_register(hwloc_static_components[i]->data, NULL);
-    /*else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
-      hwloc_xml_callbacks_register(hwloc_static_components[i]->data);*/
+    else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
+      hwloc_xml_callbacks_register(hwloc_static_components[i]->data);
     else
       assert(0);
   }
@@ -395,38 +399,163 @@ hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
     /* register for real now */
     if (HWLOC_COMPONENT_TYPE_DISC == desc->component->type)
       hwloc_disc_component_register(desc->component->data, desc->filename);
-    /*else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
-      hwloc_xml_callbacks_register(desc->component->data);*/
+    else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
+      hwloc_xml_callbacks_register(desc->component->data);
     else
       assert(0);
   }
 #endif
 
   HWLOC_COMPONENTS_UNLOCK();
+}
+
+void
+hwloc_topology_components_init(struct hwloc_topology *topology)
+{
+  topology->nr_blacklisted_components = 0;
+  topology->blacklisted_components = NULL;
 
- ok:
   topology->backends = NULL;
+  topology->backend_phases = 0;
+  topology->backend_excluded_phases = 0;
 }
 
+/* find the component whose name is exactly name up to (excluding) the first `:'; *endp gets what follows `:' or NULL */
 static struct hwloc_disc_component *
-hwloc_disc_component_find(int type /* hwloc_disc_component_type_t or -1 if any */,
-			       const char *name /* name of NULL if any */)
+hwloc_disc_component_find(const char *name, const char **endp)
 {
-  struct hwloc_disc_component *comp = hwloc_disc_components;
+  struct hwloc_disc_component *comp;
+  size_t length; /* number of characters of name that form the component name */
+  const char *end = strchr(name, HWLOC_COMPONENT_PHASESEP_CHAR);
+  if (end) {
+    length = end-name;
+    if (endp)
+      *endp = end+1;
+  } else {
+    length = strlen(name);
+    if (endp)
+      *endp = NULL;
+  }
+
+  comp = hwloc_disc_components;
   while (NULL != comp) {
-    if ((-1 == type || type == (int) comp->type)
-       && (NULL == name || !strcmp(name, comp->name)))
+    if (!strncmp(name, comp->name, length) && comp->name[length] == '\0') /* whole name, not a prefix (an empty name must match nothing) */
       return comp;
     comp = comp->next;
   }
   return NULL;
 }
 
+static unsigned
+hwloc_phases_from_string(const char *s)
+{
+  /* phase keywords, matched case-insensitively; NULL means all phases, unknown keywords mean none */
+  static const struct { const char *keyword; unsigned phase; } known[] = {
+    { "global",   HWLOC_DISC_PHASE_GLOBAL },
+    { "cpu",      HWLOC_DISC_PHASE_CPU },
+    { "memory",   HWLOC_DISC_PHASE_MEMORY },
+    { "pci",      HWLOC_DISC_PHASE_PCI },
+    { "io",       HWLOC_DISC_PHASE_IO },
+    { "misc",     HWLOC_DISC_PHASE_MISC },
+    { "annotate", HWLOC_DISC_PHASE_ANNOTATE },
+    { "tweak",    HWLOC_DISC_PHASE_TWEAK }
+  };
+  size_t k;
+
+  if (NULL == s)
+    return ~0U;
+  /* anything starting with a digit is handed to strtoul() with base 0 */
+  if (s[0] >= '0' && s[0] <= '9')
+    return (unsigned) strtoul(s, NULL, 0);
+  for (k = 0; k < sizeof(known)/sizeof(known[0]); k++)
+    if (!strcasecmp(s, known[k].keyword))
+      return known[k].phase;
+  return 0;
+}
+
+static int
+hwloc_disc_component_blacklist_one(struct hwloc_topology *topology,
+				   const char *name)
+{
+  struct hwloc_topology_forced_component_s *entries;
+  struct hwloc_disc_component *target;
+  unsigned count = topology->nr_blacklisted_components;
+  unsigned mask, idx;
+
+  if (strcmp(name, "linuxpci") && strcmp(name, "linuxio")) {
+    /* regular name, optionally followed by the phases to blacklist */
+    const char *phasestr;
+    target = hwloc_disc_component_find(name, &phasestr);
+    mask = hwloc_phases_from_string(phasestr);
+  } else {
+    /* linuxpci (pre-v2.0) and linuxio (v2.0) are deprecated names,
+     * map them to the IO phases of linux for backward compatibility */
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Replacing deprecated component `%s' with `linux' IO phases in blacklisting\n", name);
+    target = hwloc_disc_component_find("linux", NULL);
+    mask = HWLOC_DISC_PHASE_PCI | HWLOC_DISC_PHASE_IO | HWLOC_DISC_PHASE_MISC | HWLOC_DISC_PHASE_ANNOTATE;
+  }
+  if (NULL == target) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Blacklisting component `%s` phases 0x%x\n", target->name, mask);
+
+  /* component already blacklisted: only merge the additional phases */
+  for (idx = 0; idx < count; idx++)
+    if (target == topology->blacklisted_components[idx].component) {
+      topology->blacklisted_components[idx].phases |= mask;
+      return 0;
+    }
+
+  /* otherwise grow the array by one entry; the topology is left unmodified if allocation fails */
+  entries = realloc(topology->blacklisted_components, (count+1)*sizeof(*entries));
+  if (NULL == entries)
+    return -1;
+
+  entries[count].component = target;
+  entries[count].phases = mask;
+  topology->blacklisted_components = entries;
+  topology->nr_blacklisted_components = count+1;
+  return 0;
+}
+
+int
+hwloc_topology_set_components(struct hwloc_topology *topology,
+			      unsigned long flags,
+			      const char *name)
+{
+  /* keyword that, when followed by the phase separator, is handled apart from component names */
+  static const char all_keyword[] = "all";
+  const size_t all_len = sizeof(all_keyword) - 1;
+
+  /* refused with EBUSY once the topology is loaded */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  /* unknown flags are invalid, and the blacklist flag is strictly required for now */
+  if ((flags & ~HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST)
+      || flags != HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (strncmp(name, all_keyword, all_len) || name[all_len] != HWLOC_COMPONENT_PHASESEP_CHAR)
+    return hwloc_disc_component_blacklist_one(topology, name);
+  /* "all" + separator + phases: overwrite the topology-wide excluded phases */
+  topology->backend_excluded_phases = hwloc_phases_from_string(name + all_len + 1);
+  return 0;
+}
+
 /* used by set_xml(), set_synthetic(), ... environment variables, ... to force the first backend */
 int
 hwloc_disc_component_force_enable(struct hwloc_topology *topology,
 				  int envvar_forced,
-				  int type, const char *name,
+				  const char *name,
 				  const void *data1, const void *data2, const void *data3)
 {
   struct hwloc_disc_component *comp;
@@ -437,18 +566,28 @@ hwloc_disc_component_force_enable(struct hwloc_topology *topology,
     return -1;
   }
 
-  comp = hwloc_disc_component_find(type, name);
+  comp = hwloc_disc_component_find(name, NULL);
   if (!comp) {
     errno = ENOSYS;
     return -1;
   }
 
-  backend = comp->instantiate(comp, data1, data2, data3);
+  backend = comp->instantiate(topology, comp, 0U /* force-enabled don't get any phase blacklisting */,
+			      data1, data2, data3);
   if (backend) {
+    int err;
     backend->envvar_forced = envvar_forced;
     if (topology->backends)
       hwloc_backends_disable_all(topology);
-    return hwloc_backend_enable(topology, backend);
+    err = hwloc_backend_enable(backend);
+
+    if (comp->phases == HWLOC_DISC_PHASE_GLOBAL) {
+      char *env = getenv("HWLOC_ANNOTATE_GLOBAL_COMPONENTS");
+      if (env && atoi(env))
+	topology->backend_excluded_phases &= ~HWLOC_DISC_PHASE_ANNOTATE;
+    }
+
+    return err;
   } else
     return -1;
 }
@@ -456,36 +595,32 @@ hwloc_disc_component_force_enable(struct hwloc_topology *topology,
 static int
 hwloc_disc_component_try_enable(struct hwloc_topology *topology,
 				struct hwloc_disc_component *comp,
-				const char *comparg,
-				unsigned *excludes,
 				int envvar_forced,
-				int verbose_errors)
+				unsigned blacklisted_phases)
 {
   struct hwloc_backend *backend;
-  int err;
 
-  if ((*excludes) & comp->type) {
-    if (hwloc_components_verbose || verbose_errors)
-      fprintf(stderr, "Excluding %s discovery component `%s', conflicts with excludes 0x%x\n",
-	      hwloc_disc_component_type_string(comp->type), comp->name, *excludes);
+  if (!(comp->phases & ~(topology->backend_excluded_phases | blacklisted_phases))) {
+    /* all this backend phases are already excluded, exclude the backend entirely */
+    if (hwloc_components_verbose)
+      /* do not warn if envvar_forced since system-wide HWLOC_COMPONENTS must be silently ignored after set_xml() etc.
+       */
+      fprintf(stderr, "Excluding discovery component `%s' phases 0x%x, conflicts with excludes 0x%x\n",
+	      comp->name, comp->phases, topology->backend_excluded_phases);
     return -1;
   }
 
-  backend = comp->instantiate(comp, comparg, NULL, NULL);
+  backend = comp->instantiate(topology, comp, topology->backend_excluded_phases | blacklisted_phases,
+			      NULL, NULL, NULL);
   if (!backend) {
-    if (hwloc_components_verbose || verbose_errors)
+    if (hwloc_components_verbose || envvar_forced)
       fprintf(stderr, "Failed to instantiate discovery component `%s'\n", comp->name);
     return -1;
   }
 
+  backend->phases &= ~blacklisted_phases;
   backend->envvar_forced = envvar_forced;
-  err = hwloc_backend_enable(topology, backend);
-  if (err < 0)
-    return -1;
-
-  *excludes |= comp->excludes;
-
-  return 0;
+  return hwloc_backend_enable(backend);
 }
 
 void
@@ -493,39 +628,60 @@ hwloc_disc_components_enable_others(struct hwloc_topology *topology)
 {
   struct hwloc_disc_component *comp;
   struct hwloc_backend *backend;
-  unsigned excludes = 0;
   int tryall = 1;
   const char *_env;
   char *env; /* we'll to modify the env value, so duplicate it */
+  unsigned i;
 
   _env = getenv("HWLOC_COMPONENTS");
   env = _env ? strdup(_env) : NULL;
 
-  /* compute current excludes */
-  backend = topology->backends;
-  while (backend) {
-    excludes |= backend->component->excludes;
-    backend = backend->next;
-  }
-
-  /* enable explicitly listed components */
+  /* blacklist disabled components */
   if (env) {
     char *curenv = env;
     size_t s;
 
-    if (topology->backends) {
-      hwloc_backends_disable_all(topology);
-      excludes = 0;
-    }
-
     while (*curenv) {
       s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
       if (s) {
 	char c;
 
-	if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR)
+	if (curenv[0] != HWLOC_COMPONENT_EXCLUDE_CHAR)
 	  goto nextname;
 
+	/* save the last char and replace with \0 */
+	c = curenv[s];
+	curenv[s] = '\0';
+
+	/* blacklist it, and just ignore failures to allocate */
+	hwloc_disc_component_blacklist_one(topology, curenv+1);
+
+	/* remove that blacklisted name from the string */
+	for(i=0; i<s; i++)
+	  curenv[i] = *HWLOC_COMPONENT_SEPS;
+
+	/* restore chars (the second loop below needs env to be unmodified) */
+	curenv[s] = c;
+      }
+
+    nextname:
+      curenv += s;
+      if (*curenv)
+	/* Skip comma */
+	curenv++;
+    }
+  }
+
+  /* enable explicitly listed components */
+  if (env) {
+    char *curenv = env;
+    size_t s;
+
+    while (*curenv) {
+      s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+      if (s) {
+	char c, *name;
+
 	if (!strncmp(curenv, HWLOC_COMPONENT_STOP_NAME, s)) {
 	  tryall = 0;
 	  break;
@@ -535,18 +691,31 @@ hwloc_disc_components_enable_others(struct hwloc_topology *topology)
 	c = curenv[s];
 	curenv[s] = '\0';
 
-	comp = hwloc_disc_component_find(-1, curenv);
+	name = curenv;
+	if (!strcmp(name, "linuxpci") || !strcmp(name, "linuxio")) {
+	  if (hwloc_components_verbose)
+	    fprintf(stderr, "Replacing deprecated component `%s' with `linux' in envvar forcing\n", name);
+	  name = "linux";
+	}
+
+	comp = hwloc_disc_component_find(name, NULL /* we enable the entire component, phases must be blacklisted separately */);
 	if (comp) {
-	  hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 1 /* envvar forced */, 1 /* envvar forced need warnings */);
+	  unsigned blacklisted_phases = 0U;
+	  for(i=0; i<topology->nr_blacklisted_components; i++)
+	    if (comp == topology->blacklisted_components[i].component) {
+	      blacklisted_phases = topology->blacklisted_components[i].phases;
+	      break;
+	    }
+	  if (comp->phases & ~blacklisted_phases)
+	    hwloc_disc_component_try_enable(topology, comp, 1 /* envvar forced */, blacklisted_phases);
 	} else {
-	  fprintf(stderr, "Cannot find discovery component `%s'\n", curenv);
+	  fprintf(stderr, "Cannot find discovery component `%s'\n", name);
 	}
 
 	/* restore chars (the second loop below needs env to be unmodified) */
 	curenv[s] = c;
       }
 
-nextname:
       curenv += s;
       if (*curenv)
 	/* Skip comma */
@@ -560,24 +729,24 @@ hwloc_disc_components_enable_others(struct hwloc_topology *topology)
   if (tryall) {
     comp = hwloc_disc_components;
     while (NULL != comp) {
-      /* check if this component was explicitly excluded in env */
-      if (env) {
-	char *curenv = env;
-	while (*curenv) {
-	  size_t s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
-	  if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR && !strncmp(curenv+1, comp->name, s-1)) {
-	    if (hwloc_components_verbose)
-	      fprintf(stderr, "Excluding %s discovery component `%s' because of HWLOC_COMPONENTS environment variable\n",
-	    hwloc_disc_component_type_string(comp->type), comp->name);
-	    goto nextcomp;
-	  }
-	  curenv += s;
-	  if (*curenv)
-	    /* Skip comma */
-	    curenv++;
+      unsigned blacklisted_phases = 0U;
+      if (!comp->enabled_by_default)
+	goto nextcomp;
+      /* check if this component was blacklisted by the application */
+      for(i=0; i<topology->nr_blacklisted_components; i++)
+	if (comp == topology->blacklisted_components[i].component) {
+	  blacklisted_phases = topology->blacklisted_components[i].phases;
+	  break;
 	}
+
+      if (!(comp->phases & ~blacklisted_phases)) {
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Excluding blacklisted discovery component `%s' phases 0x%x\n",
+		  comp->name, comp->phases);
+	goto nextcomp;
       }
-      hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 0 /* defaults, not envvar forced */, 0 /* defaults don't need warnings on conflicts */);
+
+      hwloc_disc_component_try_enable(topology, comp, 0 /* defaults, not envvar forced */, blacklisted_phases);
 nextcomp:
       comp = comp->next;
     }
@@ -589,19 +758,18 @@ hwloc_disc_components_enable_others(struct hwloc_topology *topology)
     backend = topology->backends;
     fprintf(stderr, "Final list of enabled discovery components: ");
     while (backend != NULL) {
-      fprintf(stderr, "%s%s", first ? "" : ",", backend->component->name);
+      fprintf(stderr, "%s%s(0x%x)", first ? "" : ",", backend->component->name, backend->phases);
       backend = backend->next;
       first = 0;
     }
     fprintf(stderr, "\n");
   }
 
-  if (env)
-    free(env);
+  free(env);
 }
 
 void
-hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_unused)
+hwloc_components_fini(void)
 {
   unsigned i;
 
@@ -621,7 +789,7 @@ hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_u
   /* no need to unlink/free the list of components, they'll be unloaded below */
 
   hwloc_disc_components = NULL;
-//  hwloc_xml_callbacks_reset();
+  hwloc_xml_callbacks_reset();
 
 #ifdef HWLOC_HAVE_PLUGINS
   hwloc_plugins_exit();
@@ -631,7 +799,8 @@ hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_u
 }
 
 struct hwloc_backend *
-hwloc_backend_alloc(struct hwloc_disc_component *component)
+hwloc_backend_alloc(struct hwloc_topology *topology,
+		    struct hwloc_disc_component *component)
 {
   struct hwloc_backend * backend = malloc(sizeof(*backend));
   if (!backend) {
@@ -639,10 +808,15 @@ hwloc_backend_alloc(struct hwloc_disc_component *component)
     return NULL;
   }
   backend->component = component;
+  backend->topology = topology;
+  /* filter-out component phases that are excluded */
+  backend->phases = component->phases & ~topology->backend_excluded_phases;
+  if (backend->phases != component->phases && hwloc_components_verbose)
+    fprintf(stderr, "Trying discovery component `%s' with phases 0x%x instead of 0x%x\n",
+	    component->name, backend->phases, component->phases);
   backend->flags = 0;
   backend->discover = NULL;
-  backend->get_obj_cpuset = NULL;
-  backend->notify_new_object = NULL;
+  backend->get_pci_busid_cpuset = NULL;
   backend->disable = NULL;
   backend->is_thissystem = -1;
   backend->next = NULL;
@@ -659,14 +833,15 @@ hwloc_backend_disable(struct hwloc_backend *backend)
 }
 
 int
-hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend)
+hwloc_backend_enable(struct hwloc_backend *backend)
 {
+  struct hwloc_topology *topology = backend->topology;
   struct hwloc_backend **pprev;
 
   /* check backend flags */
-  if (backend->flags & (~(HWLOC_BACKEND_FLAG_NEED_LEVELS))) {
-    fprintf(stderr, "Cannot enable %s discovery component `%s' with unknown flags %lx\n",
-	    hwloc_disc_component_type_string(backend->component->type), backend->component->name, backend->flags);
+  if (backend->flags) {
+    fprintf(stderr, "Cannot enable discovery component `%s' phases 0x%x with unknown flags %lx\n",
+	    backend->component->name, backend->component->phases, backend->flags);
     return -1;
   }
 
@@ -675,8 +850,8 @@ hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *back
   while (NULL != *pprev) {
     if ((*pprev)->component == backend->component) {
       if (hwloc_components_verbose)
-	fprintf(stderr, "Cannot enable %s discovery component `%s' twice\n",
-		hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+	fprintf(stderr, "Cannot enable  discovery component `%s' phases 0x%x twice\n",
+		backend->component->name, backend->component->phases);
       hwloc_backend_disable(backend);
       errno = EBUSY;
       return -1;
@@ -685,8 +860,8 @@ hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *back
   }
 
   if (hwloc_components_verbose)
-    fprintf(stderr, "Enabling %s discovery component `%s'\n",
-	    hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+    fprintf(stderr, "Enabling discovery component `%s' with phases 0x%x (among 0x%x)\n",
+	    backend->component->name, backend->phases, backend->component->phases);
 
   /* enqueue at the end */
   pprev = &topology->backends;
@@ -695,8 +870,8 @@ hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *back
   backend->next = *pprev;
   *pprev = backend;
 
-  backend->topology = topology;
-
+  topology->backend_phases |= backend->component->phases;
+  topology->backend_excluded_phases |= backend->component->excluded_phases;
   return 0;
 }
 
@@ -706,7 +881,7 @@ hwloc_backends_is_thissystem(struct hwloc_topology *topology)
   struct hwloc_backend *backend;
   const char *local_env;
 
-  /* Apply is_thissystem topology flag before we enforce envvar backends.
+  /*
    * If the application changed the backend with set_foo(),
    * it may use set_flags() update the is_thissystem flag here.
    * If it changes the backend with environment variables below,
@@ -745,34 +920,20 @@ hwloc_backends_is_thissystem(struct hwloc_topology *topology)
     topology->is_thissystem = atoi(local_env);
 }
 
-int
-hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+void
+hwloc_backends_find_callbacks(struct hwloc_topology *topology)
 {
-  struct hwloc_topology *topology = caller->topology;
   struct hwloc_backend *backend = topology->backends;
-  /* use the first backend's get_obj_cpuset callback */
+  /* use the first backend's get_pci_busid_cpuset callback */
+  topology->get_pci_busid_cpuset_backend = NULL;
   while (backend != NULL) {
-    if (backend->get_obj_cpuset)
-      return backend->get_obj_cpuset(backend, caller, obj, cpuset);
-    backend = backend->next;
-  }
-  return -1;
-}
-
-int
-hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj)
-{
-  struct hwloc_backend *backend;
-  int res = 0;
-
-  backend = caller->topology->backends;
-  while (NULL != backend) {
-    if (backend != caller && backend->notify_new_object)
-      res += backend->notify_new_object(backend, caller, obj);
+    if (backend->get_pci_busid_cpuset) {
+      topology->get_pci_busid_cpuset_backend = backend;
+      return;
+    }
     backend = backend->next;
   }
-
-  return res;
+  return;
 }
 
 void
@@ -783,10 +944,20 @@ hwloc_backends_disable_all(struct hwloc_topology *topology)
   while (NULL != (backend = topology->backends)) {
     struct hwloc_backend *next = backend->next;
     if (hwloc_components_verbose)
-      fprintf(stderr, "Disabling %s discovery component `%s'\n",
-	      hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+      fprintf(stderr, "Disabling discovery component `%s'\n",
+	      backend->component->name);
     hwloc_backend_disable(backend);
     topology->backends = next;
   }
   topology->backends = NULL;
+  topology->backend_excluded_phases = 0;
+}
+
+void
+hwloc_topology_components_fini(struct hwloc_topology *topology)
+{
+  /* no backend may remain here: hwloc_backends_disable_all() must have run before */
+  assert(NULL == topology->backends);
+  /* release the array of blacklisted components */
+  free(topology->blacklisted_components);
+}
diff --git a/ext/hwloc/hwloc/diff.c b/ext/hwloc/hwloc/diff.c
index ee401d264..7794358bb 100644
--- a/ext/hwloc/hwloc/diff.c
+++ b/ext/hwloc/hwloc/diff.c
@@ -1,14 +1,13 @@
 /*
- * Copyright © 2013-2015 Inria.  All rights reserved.
+ * Copyright © 2013-2019 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <private/private.h>
-#include <private/misc.h>
+#include "private/autogen/config.h"
+#include "private/private.h"
+#include "private/misc.h"
 
-int hwloc_topology_diff_destroy(hwloc_topology_t topology __hwloc_attribute_unused,
-				hwloc_topology_diff_t diff)
+int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff)
 {
 	hwloc_topology_diff_t next;
 	while (diff) {
@@ -127,8 +126,12 @@ hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
 
 	if (obj1->depth != obj2->depth)
 		goto out_too_complex;
+
 	if (obj1->type != obj2->type)
 		goto out_too_complex;
+	if ((!obj1->subtype) != (!obj2->subtype)
+	    || (obj1->subtype && strcmp(obj1->subtype, obj2->subtype)))
+		goto out_too_complex;
 
 	if (obj1->os_index != obj2->os_index)
 		/* we could allow different os_index for non-PU non-NUMAnode objects
@@ -141,15 +144,15 @@ hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
 #define SETS_DIFFERENT(_set, _obj1, _obj2) _SETS_DIFFERENT((_obj1)->_set, (_obj2)->_set)
 	if (SETS_DIFFERENT(cpuset, obj1, obj2)
 	    || SETS_DIFFERENT(complete_cpuset, obj1, obj2)
-	    || SETS_DIFFERENT(allowed_cpuset, obj1, obj2)
 	    || SETS_DIFFERENT(nodeset, obj1, obj2)
-	    || SETS_DIFFERENT(complete_nodeset, obj1, obj2)
-	    || SETS_DIFFERENT(allowed_nodeset, obj1, obj2))
+	    || SETS_DIFFERENT(complete_nodeset, obj1, obj2))
 		goto out_too_complex;
 
 	/* no need to check logical_index, sibling_rank, symmetric_subtree,
 	 * the parents did it */
 
+	/* gp_index don't have to be strictly identical */
+
 	if ((!obj1->name) != (!obj2->name)
 	    || (obj1->name && strcmp(obj1->name, obj2->name))) {
 		err = hwloc_append_diff_obj_attr_string(obj1,
@@ -162,24 +165,31 @@ hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
 			return err;
 	}
 
-	/* memory */
-	if (obj1->memory.local_memory != obj2->memory.local_memory) {
-		err = hwloc_append_diff_obj_attr_uint64(obj1,
-						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
-						       0,
-						       obj1->memory.local_memory,
-						       obj2->memory.local_memory,
-						       firstdiffp, lastdiffp);
-		if (err < 0)
-			return err;
-	}
-	/* ignore memory page_types */
-
 	/* type-specific attrs */
 	switch (obj1->type) {
 	default:
 		break;
-	case HWLOC_OBJ_CACHE:
+	case HWLOC_OBJ_NUMANODE:
+		if (obj1->attr->numanode.local_memory != obj2->attr->numanode.local_memory) {
+			err = hwloc_append_diff_obj_attr_uint64(obj1,
+								HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+								0,
+								obj1->attr->numanode.local_memory,
+								obj2->attr->numanode.local_memory,
+								firstdiffp, lastdiffp);
+			if (err < 0)
+				return err;
+		}
+		/* ignore memory page_types */
+		break;
+	case HWLOC_OBJ_L1CACHE:
+	case HWLOC_OBJ_L2CACHE:
+	case HWLOC_OBJ_L3CACHE:
+	case HWLOC_OBJ_L4CACHE:
+	case HWLOC_OBJ_L5CACHE:
+	case HWLOC_OBJ_L1ICACHE:
+	case HWLOC_OBJ_L2ICACHE:
+	case HWLOC_OBJ_L3ICACHE:
 		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->cache)))
 			goto out_too_complex;
 		break;
@@ -201,32 +211,20 @@ hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
 		break;
 	}
 
-	/* distances */
-	if (obj1->distances_count != obj2->distances_count)
-		goto out_too_complex;
-	for(i=0; i<obj1->distances_count; i++) {
-		struct hwloc_distances_s *d1 = obj1->distances[i], *d2 = obj2->distances[i];
-		if (d1->relative_depth != d2->relative_depth
-		    || d1->nbobjs != d2->nbobjs
-		    || d1->latency_max != d2->latency_max
-		    || d1->latency_base != d2->latency_base
-		    || memcmp(d1->latency, d2->latency, d1->nbobjs * d1->nbobjs * sizeof(*d1->latency)))
-			goto out_too_complex;
-	}
-
 	/* infos */
 	if (obj1->infos_count != obj2->infos_count)
 		goto out_too_complex;
 	for(i=0; i<obj1->infos_count; i++) {
-		if (strcmp(obj1->infos[i].name, obj2->infos[i].name))
+		struct hwloc_info_s *info1 = &obj1->infos[i], *info2 = &obj2->infos[i];
+		if (strcmp(info1->name, info2->name))
 			goto out_too_complex;
 		if (strcmp(obj1->infos[i].value, obj2->infos[i].value)) {
 			err = hwloc_append_diff_obj_attr_string(obj1,
-							       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
-							       obj1->infos[i].name,
-							       obj1->infos[i].value,
-							       obj2->infos[i].value,
-							       firstdiffp, lastdiffp);
+								HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
+								info1->name,
+								info1->value,
+								info2->value,
+								firstdiffp, lastdiffp);
 			if (err < 0)
 				return err;
 		}
@@ -248,6 +246,20 @@ hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
 	if (child1 || child2)
 		goto out_too_complex;
 
+	/* memory children */
+	for(child1 = obj1->memory_first_child, child2 = obj2->memory_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
 	/* I/O children */
 	for(child1 = obj1->io_first_child, child2 = obj2->io_first_child;
 	    child1 != NULL && child2 != NULL;
@@ -289,8 +301,15 @@ int hwloc_topology_diff_build(hwloc_topology_t topo1,
 			      hwloc_topology_diff_t *diffp)
 {
 	hwloc_topology_diff_t lastdiff, tmpdiff;
+	struct hwloc_internal_distances_s *dist1, *dist2;
+	unsigned i;
 	int err;
 
+	if (!topo1->is_loaded || !topo2->is_loaded) {
+	  errno = EINVAL;
+	  return -1;
+	}
+
 	if (flags != 0) {
 		errno = EINVAL;
 		return -1;
@@ -301,7 +320,6 @@ int hwloc_topology_diff_build(hwloc_topology_t topo1,
 			       topo2, hwloc_get_root_obj(topo2),
 			       flags,
 			       diffp, &lastdiff);
-
 	if (!err) {
 		tmpdiff = *diffp;
 		while (tmpdiff) {
@@ -313,6 +331,47 @@ int hwloc_topology_diff_build(hwloc_topology_t topo1,
 		}
 	}
 
+	if (!err) {
+		if (SETS_DIFFERENT(allowed_cpuset, topo1, topo2)
+		    || SETS_DIFFERENT(allowed_nodeset, topo1, topo2)) {
+			hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+			err = 1;
+		}
+	}
+
+	if (!err) {
+		/* distances: walk both lists pairwise; !err stops at the first mismatch, even one found in the per-object for loop (its break only leaves that for) */
+		hwloc_internal_distances_refresh(topo1);
+		hwloc_internal_distances_refresh(topo2);
+		dist1 = topo1->first_dist;
+		dist2 = topo2->first_dist;
+		while (!err && (dist1 || dist2)) {
+			if (!!dist1 != !!dist2) {
+				hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+				err = 1;
+				break;
+			}
+			if (dist1->unique_type != dist2->unique_type
+			    || dist1->different_types || dist2->different_types /* too lazy to support this case */
+			    || dist1->nbobjs != dist2->nbobjs
+			    || dist1->kind != dist2->kind
+			    || memcmp(dist1->values, dist2->values, dist1->nbobjs * dist1->nbobjs * sizeof(*dist1->values))) {
+				hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+				err = 1;
+				break;
+			}
+			for(i=0; i<dist1->nbobjs; i++)
+				/* gp_index isn't enforced above. so compare logical_index instead, which is enforced. requires distances refresh() above */
+				if (dist1->objs[i]->logical_index != dist2->objs[i]->logical_index) {
+					hwloc_append_diff_too_complex(hwloc_get_root_obj(topo1), diffp, &lastdiff);
+					err = 1;
+					break;
+				}
+			dist1 = dist1->next;
+			dist2 = dist2->next;
+		}
+	}
+
 	return err;
 }
 
@@ -340,12 +399,14 @@ hwloc_apply_diff_one(hwloc_topology_t topology,
 			hwloc_uint64_t oldvalue = reverse ? obj_attr->diff.uint64.newvalue : obj_attr->diff.uint64.oldvalue;
 			hwloc_uint64_t newvalue = reverse ? obj_attr->diff.uint64.oldvalue : obj_attr->diff.uint64.newvalue;
 			hwloc_uint64_t valuediff = newvalue - oldvalue;
-			if (obj->memory.local_memory != oldvalue)
+			if (obj->type != HWLOC_OBJ_NUMANODE)
+				return -1;
+			if (obj->attr->numanode.local_memory != oldvalue)
 				return -1;
-			obj->memory.local_memory = newvalue;
+			obj->attr->numanode.local_memory = newvalue;
 			tmpobj = obj;
 			while (tmpobj) {
-				tmpobj->memory.total_memory += valuediff;
+				tmpobj->total_memory += valuediff;
 				tmpobj = tmpobj->parent;
 			}
 			break;
@@ -366,10 +427,11 @@ hwloc_apply_diff_one(hwloc_topology_t topology,
 			unsigned i;
 			int found = 0;
 			for(i=0; i<obj->infos_count; i++) {
-				if (!strcmp(obj->infos[i].name, name)
-				    && !strcmp(obj->infos[i].value, oldvalue)) {
-					free(obj->infos[i].value);
-					obj->infos[i].value = strdup(newvalue);
+				struct hwloc_info_s *info = &obj->infos[i];
+				if (!strcmp(info->name, name)
+				    && !strcmp(info->value, oldvalue)) {
+					free(info->value);
+					info->value = strdup(newvalue);
 					found = 1;
 					break;
 				}
@@ -398,6 +460,15 @@ int hwloc_topology_diff_apply(hwloc_topology_t topology,
 	hwloc_topology_diff_t tmpdiff, tmpdiff2;
 	int err, nr;
 
+	if (!topology->is_loaded) {
+	  errno = EINVAL;
+	  return -1;
+	}
+	if (topology->adopted_shmem_addr) {
+	  errno = EPERM;
+	  return -1;
+	}
+
 	if (flags & ~HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE) {
 		errno = EINVAL;
 		return -1;
diff --git a/ext/hwloc/hwloc/distances.c b/ext/hwloc/hwloc/distances.c
index 51382b10e..9e56a9696 100644
--- a/ext/hwloc/hwloc/distances.c
+++ b/ext/hwloc/hwloc/distances.c
@@ -1,689 +1,938 @@
 /*
- * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2019 Inria.  All rights reserved.
  * Copyright © 2011-2012 Université Bordeaux
  * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/debug.h"
+#include "private/misc.h"
 
 #include <float.h>
 #include <math.h>
 
-/**************************
- * Main Init/Clear/Destroy
+static struct hwloc_internal_distances_s *
+hwloc__internal_distances_from_public(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/******************************************************
+ * Global init, prepare, destroy, dup
  */
 
-/* called during topology init */
-void hwloc_distances_init(struct hwloc_topology *topology)
+/* called during topology init(): start with an empty list of distance matrices */
+void hwloc_internal_distances_init(struct hwloc_topology *topology)
+{
+  topology->first_dist = topology->last_dist = NULL;
+  topology->next_dist_id = 0; /* counter that provides the id of each matrix added later */
+}
+
+/* called at the beginning of load(): decides whether grouping by distances is enabled and reads its tuning from the environment */
+void hwloc_internal_distances_prepare(struct hwloc_topology *topology)
+{
+  char *env;
+  hwloc_localeswitch_declare;
+
+  topology->grouping = 1; /* enabled unless one of the two checks below disables it */
+  if (topology->type_filter[HWLOC_OBJ_GROUP] == HWLOC_TYPE_FILTER_KEEP_NONE)
+    topology->grouping = 0;
+  env = getenv("HWLOC_GROUPING");
+  if (env && !atoi(env))
+    topology->grouping = 0; /* also reached when the value is not a number, since atoi() then yields 0 */
+
+  if (topology->grouping) {
+    topology->grouping_next_subkind = 0;
+
+    HWLOC_BUILD_ASSERT(sizeof(topology->grouping_accuracies)/sizeof(*topology->grouping_accuracies) == 5);
+    topology->grouping_accuracies[0] = 0.0f;
+    topology->grouping_accuracies[1] = 0.01f;
+    topology->grouping_accuracies[2] = 0.02f;
+    topology->grouping_accuracies[3] = 0.05f;
+    topology->grouping_accuracies[4] = 0.1f;
+    topology->grouping_nbaccuracies = 5; /* default, reduced to 1 below unless HWLOC_GROUPING_ACCURACY is "try" */
+
+    hwloc_localeswitch_init(); /* NOTE(review): presumably makes atof() below locale-independent -- confirm */
+    env = getenv("HWLOC_GROUPING_ACCURACY");
+    if (!env) {
+      /* only use 0.0 */
+      topology->grouping_nbaccuracies = 1;
+    } else if (strcmp(env, "try")) {
+      /* use the given value */
+      topology->grouping_nbaccuracies = 1;
+      topology->grouping_accuracies[0] = (float) atof(env);
+    } /* otherwise try all values */
+    hwloc_localeswitch_fini();
+
+    topology->grouping_verbose = 0;
+    env = getenv("HWLOC_GROUPING_VERBOSE");
+    if (env)
+      topology->grouping_verbose = atoi(env);
+  }
+}
+
+static void hwloc_internal_distances_free(struct hwloc_internal_distances_s *dist) /* frees one matrix and all the arrays it owns */
 {
-  topology->first_osdist = topology->last_osdist = NULL;
+  free(dist->name);
+  free(dist->different_types);
+  free(dist->indexes);
+  free(dist->objs);
+  free(dist->values);
+  free(dist); /* the list links are not touched here, callers unlink dist themselves */
 }
 
 /* called during topology destroy */
-void hwloc_distances_destroy(struct hwloc_topology * topology)
+void hwloc_internal_distances_destroy(struct hwloc_topology * topology) /* frees every matrix and leaves the list empty */
 {
-  struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
-  while ((osdist = next) != NULL) {
-    next = osdist->next;
-    /* remove final distance matrics AND physically-ordered ones */
-    free(osdist->indexes);
-    free(osdist->objs);
-    free(osdist->distances);
-    free(osdist);
+  struct hwloc_internal_distances_s *dist, *next = topology->first_dist;
+  while ((dist = next) != NULL) {
+    next = dist->next; /* read the link before dist is freed */
+    hwloc_internal_distances_free(dist);
   }
-  topology->first_osdist = topology->last_osdist = NULL;
+  topology->first_dist = topology->last_dist = NULL;
 }
 
-/******************************************************
- * Inserting distances in the topology
- * from a backend, from the environment or by the user
- */
+static int hwloc_internal_distances_dup_one(struct hwloc_topology *new, struct hwloc_internal_distances_s *olddist) /* deep-copies olddist at the end of new's list, returns 0 or -1 */
+{
+  struct hwloc_tma *tma = new->tma;
+  struct hwloc_internal_distances_s *newdist;
+  unsigned nbobjs = olddist->nbobjs;
 
-/* insert a distance matrix in the topology.
- * the caller gives us those pointers, we take care of freeing them later and so on.
- */
-void hwloc_distances_set(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
-			 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances,
-			 int force)
-{
-  struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
-  /* look for existing distances for the same type */
-  while ((osdist = next) != NULL) {
-    next = osdist->next;
-    if (osdist->type == type) {
-      if (osdist->forced && !force) {
-	/* there is a forced distance element, ignore the new non-forced one */
-	free(indexes);
-	free(objs);
-	free(distances);
-	return;
-      } else if (force) {
-	/* we're forcing a new distance, remove the old ones */
-	free(osdist->indexes);
-	free(osdist->objs);
-	free(osdist->distances);
-	/* remove current object */
-	if (osdist->prev)
-	  osdist->prev->next = next;
-	else
-	  topology->first_osdist = next;
-	if (next)
-	  next->prev = osdist->prev;
-	else
-	  topology->last_osdist = osdist->prev;
-	/* free current object */
-	free(osdist);
-      }
+  newdist = hwloc_tma_calloc(tma, sizeof(*newdist)); /* zeroed so that the error paths below never free() an uninitialized pointer */
+  if (!newdist)
+    return -1;
+  if (olddist->name) {
+    newdist->name = hwloc_tma_strdup(tma, olddist->name);
+    if (!newdist->name) {
+      assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+      hwloc_internal_distances_free(newdist); /* safe: all the other pointers are still NULL */
+      return -1;
     }
+  } else {
+    newdist->name = NULL;
   }
 
-  if (!nbobjs)
-    /* we're just clearing, return now */
-    return;
+  if (olddist->different_types) {
+    newdist->different_types = hwloc_tma_malloc(tma, nbobjs * sizeof(*newdist->different_types));
+    if (!newdist->different_types) {
+      assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+      hwloc_internal_distances_free(newdist); /* indexes, objs and values are still NULL here */
+      return -1;
+    }
+    memcpy(newdist->different_types, olddist->different_types, nbobjs * sizeof(*newdist->different_types));
+  } else
+    newdist->different_types = NULL;
+  newdist->unique_type = olddist->unique_type;
+  newdist->nbobjs = nbobjs;
+  newdist->kind = olddist->kind;
+  newdist->id = olddist->id; /* the copy keeps the id of the original matrix */
+
+  newdist->indexes = hwloc_tma_malloc(tma, nbobjs * sizeof(*newdist->indexes));
+  newdist->objs = hwloc_tma_calloc(tma, nbobjs * sizeof(*newdist->objs)); /* object pointers are not copied, they stay NULL until a refresh */
+  newdist->iflags = olddist->iflags & ~HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID; /* must be revalidated after dup() */
+  newdist->values = hwloc_tma_malloc(tma, nbobjs*nbobjs * sizeof(*newdist->values));
+  if (!newdist->indexes || !newdist->objs || !newdist->values) {
+    assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+    hwloc_internal_distances_free(newdist);
+    return -1;
+  }
 
-  /* create the new element */
-  osdist = malloc(sizeof(struct hwloc_os_distances_s));
-  osdist->nbobjs = nbobjs;
-  osdist->indexes = indexes;
-  osdist->objs = objs;
-  osdist->distances = distances;
-  osdist->forced = force;
-  osdist->type = type;
-  /* insert it */
-  osdist->next = NULL;
-  osdist->prev = topology->last_osdist;
-  if (topology->last_osdist)
-    topology->last_osdist->next = osdist;
+  memcpy(newdist->indexes, olddist->indexes, nbobjs * sizeof(*newdist->indexes));
+  memcpy(newdist->values, olddist->values, nbobjs*nbobjs * sizeof(*newdist->values));
+
+  newdist->next = NULL; /* append at the tail of new's list */
+  newdist->prev = new->last_dist;
+  if (new->last_dist)
+    new->last_dist->next = newdist;
   else
-    topology->first_osdist = osdist;
-  topology->last_osdist = osdist;
+    new->first_dist = newdist;
+  new->last_dist = newdist;
+
+  return 0;
 }
 
-/* make sure a user-given distance matrix is sane */
-static int hwloc_distances__check_matrix(hwloc_topology_t __hwloc_restrict topology __hwloc_attribute_unused, hwloc_obj_type_t type __hwloc_attribute_unused,
-					 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs __hwloc_attribute_unused, float *distances __hwloc_attribute_unused)
+/* Duplicate all matrices of old into new, stopping at the first failure. This function may be called with topology->tma set, it cannot free() or realloc() */
+int hwloc_internal_distances_dup(struct hwloc_topology *new, struct hwloc_topology *old)
 {
-  unsigned i,j;
-  /* make sure we don't have the same index twice */
-  for(i=0; i<nbobjs; i++)
-    for(j=i+1; j<nbobjs; j++)
-      if (indexes[i] == indexes[j]) {
-	errno = EINVAL;
-	return -1;
-      }
+  struct hwloc_internal_distances_s *olddist;
+  int err;
+  new->next_dist_id = old->next_dist_id; /* keep the id counter in sync with the source topology */
+  for(olddist = old->first_dist; olddist; olddist = olddist->next) {
+    err = hwloc_internal_distances_dup_one(new, olddist);
+    if (err < 0)
+      return err; /* matrices that were already duplicated stay in new's list */
+  }
   return 0;
 }
 
-static void hwloc_distances__set_from_string(struct hwloc_topology *topology,
-					     hwloc_obj_type_t type, const char *string)
+/******************************************************
+ * Remove distances from the topology
+ */
+
+int hwloc_distances_remove(hwloc_topology_t topology) /* removes all distance matrices; returns 0, or -1 with errno set */
 {
-  /* the string format is: "index[0],...,index[N-1]:distance[0],...,distance[N*N-1]"
-   * or "index[0],...,index[N-1]:X*Y" or "index[0],...,index[N-1]:X*Y*Z"
-   */
-  const char *tmp = string, *next;
-  unsigned *indexes;
-  float *distances;
-  unsigned nbobjs = 0, i, j, x, y, z;
+  if (!topology->is_loaded) {
+    errno = EINVAL; /* the topology must be loaded first */
+    return -1;
+  }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM; /* topologies with an adopted shmem address are not modified */
+    return -1;
+  }
+  hwloc_internal_distances_destroy(topology);
+  return 0;
+}
 
-  if (!strcmp(string, "none")) {
-    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
-    return;
+int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth) /* removes the matrices whose unique_type is the type found at depth */
+{
+  struct hwloc_internal_distances_s *dist, *next;
+  hwloc_obj_type_t type;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
+    return -1;
   }
 
-  if (sscanf(string, "%u-%u:", &i, &j) == 2) {
-    /* range i-j */
-    nbobjs = j-i+1;
-    indexes = calloc(nbobjs, sizeof(unsigned));
-    distances = calloc(nbobjs*nbobjs, sizeof(float));
-    /* make sure the user didn't give a veeeeery large range */
-    if (!indexes || !distances) {
-      free(indexes);
-      free(distances);
-      return;
-    }
-    for(j=0; j<nbobjs; j++)
-      indexes[j] = j+i;
-    tmp = strchr(string, ':') + 1;
+  /* switch back to types since we don't support groups for now */
+  type = hwloc_get_depth_type(topology, depth);
+  if (type == (hwloc_obj_type_t)-1) {
+    errno = EINVAL; /* no type could be found for this depth */
+    return -1;
+  }
 
-  } else {
-    /* explicit list of indexes, count them */
-    while (1) {
-      size_t size = strspn(tmp, "0123456789");
-      if (tmp[size] != ',') {
-	/* last element */
-	tmp += size;
-	nbobjs++;
-	break;
-      }
-      /* another index */
-      tmp += size+1;
-      nbobjs++;
+  next = topology->first_dist;
+  while ((dist = next) != NULL) {
+    next = dist->next; /* saved first because dist may be freed below */
+    if (dist->unique_type == type) {
+      if (next)
+	next->prev = dist->prev;
+      else
+	topology->last_dist = dist->prev;
+      if (dist->prev)
+	dist->prev->next = dist->next;
+      else
+	topology->first_dist = dist->next;
+      hwloc_internal_distances_free(dist);
     }
+  }
 
-    if (*tmp != ':') {
-      fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
-	      hwloc_obj_type_string(type));
-      return;
-    }
+  return 0; /* also returned when no matrix matched */
+}
 
-    indexes = calloc(nbobjs, sizeof(unsigned));
-    distances = calloc(nbobjs*nbobjs, sizeof(float));
-    tmp = string;
+int hwloc_distances_release_remove(hwloc_topology_t topology, /* removes the internal matrix behind a public handle, then releases the handle */
+				   struct hwloc_distances_s *distances)
+{
+  struct hwloc_internal_distances_s *dist = hwloc__internal_distances_from_public(topology, distances);
+  if (!dist) {
+    errno = EINVAL; /* no internal matrix was found for this handle; the handle is not released */
+    return -1;
+  }
+  if (dist->prev)
+    dist->prev->next = dist->next;
+  else
+    topology->first_dist = dist->next;
+  if (dist->next)
+    dist->next->prev = dist->prev;
+  else
+    topology->last_dist = dist->prev;
+  hwloc_internal_distances_free(dist);
+  hwloc_distances_release(topology, distances);
+  return 0;
+}
 
-    /* parse indexes */
-    for(i=0; i<nbobjs; i++) {
-      indexes[i] = strtoul(tmp, (char **) &next, 0);
-      tmp = next+1;
-    }
+/******************************************************
+ * Add distances to the topology
+ */
+
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology, unsigned nbobjs, struct hwloc_obj **objs, uint64_t *values, unsigned long kind, unsigned nbaccuracies, float *accuracies, int needcheck);
+
+/* insert a distance matrix at the end of the topology list, returns 0 or -1.
+ * the caller gives us the different_types, objs, indexes and values pointers: we own them and free them on failure too. name is duplicated.
+ */
+static int
+hwloc_internal_distances__add(hwloc_topology_t topology, const char *name,
+			      hwloc_obj_type_t unique_type, hwloc_obj_type_t *different_types,
+			      unsigned nbobjs, hwloc_obj_t *objs, uint64_t *indexes, uint64_t *values,
+			      unsigned long kind, unsigned iflags)
+{
+  struct hwloc_internal_distances_s *dist;
+
+  if (different_types) {
+    kind |= HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES; /* the user isn't forced to give it */
+  } else if (kind & HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES) {
+    errno = EINVAL;
+    goto err;
   }
 
+  dist = calloc(1, sizeof(*dist));
+  if (!dist)
+    goto err;
 
-  /* parse distances */
-  z=1; /* default if sscanf finds only 2 values below */
-  if (sscanf(tmp, "%u*%u*%u", &x, &y, &z) >= 2) {
-    /* generate the matrix to create x groups of y elements */
-    if (x*y*z != nbobjs) {
-      fprintf(stderr, "Ignoring %s distances from environment variable, invalid grouping (%u*%u*%u=%u instead of %u)\n",
-	      hwloc_obj_type_string(type), x, y, z, x*y*z, nbobjs);
-      free(indexes);
-      free(distances);
-      return;
-    }
-    for(i=0; i<nbobjs; i++)
-      for(j=0; j<nbobjs; j++)
-	if (i==j)
-	  distances[i*nbobjs+j] = 1;
-	else if (i/z == j/z)
-	  distances[i*nbobjs+j] = 2;
-	else if (i/z/y == j/z/y)
-	  distances[i*nbobjs+j] = 4;
-	else
-	  distances[i*nbobjs+j] = 8;
+  dist->unique_type = unique_type;
+  dist->different_types = different_types;
+  dist->nbobjs = nbobjs;
+  dist->kind = kind;
+  dist->iflags = iflags;
+
+  assert(!!(iflags & HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID) == !!objs); /* the flag must be set exactly when objs is given */
+
+  if (!objs) {
+    assert(indexes);
+    /* we only have indexes, we'll refresh objs from there */
+    dist->indexes = indexes;
+    dist->objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+    if (!dist->objs)
+      goto err_with_dist;
 
   } else {
-    /* parse a comma separated list of distances */
-    for(i=0; i<nbobjs*nbobjs; i++) {
-      distances[i] = (float) atof(tmp);
-      next = strchr(tmp, ',');
-      if (next) {
-        tmp = next+1;
-      } else if (i!=nbobjs*nbobjs-1) {
-	fprintf(stderr, "Ignoring %s distances from environment variable, not enough values (%u out of %u)\n",
-		hwloc_obj_type_string(type), i+1, nbobjs*nbobjs);
-	free(indexes);
-	free(distances);
-	return;
-      }
+    unsigned i;
+    assert(!indexes);
+    /* we only have objs, generate the indexes arrays so that we can refresh objs later */
+    dist->objs = objs;
+    dist->indexes = malloc(nbobjs * sizeof(*dist->indexes));
+    if (!dist->indexes)
+      goto err_with_dist;
+    if (HWLOC_DIST_TYPE_USE_OS_INDEX(dist->unique_type)) {
+      for(i=0; i<nbobjs; i++)
+	dist->indexes[i] = objs[i]->os_index;
+    } else {
+      for(i=0; i<nbobjs; i++)
+	dist->indexes[i] = objs[i]->gp_index;
     }
   }
 
-  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0) {
-    fprintf(stderr, "Ignoring invalid %s distances from environment variable\n", hwloc_obj_type_string(type));
-    free(indexes);
-    free(distances);
-    return;
+  if (name)
+    dist->name = strdup(name); /* ignore failure; done after the last failure point so err_with_dist has no name to leak */
+
+  dist->values = values;
+
+  dist->id = topology->next_dist_id++;
+
+  if (topology->last_dist)
+    topology->last_dist->next = dist;
+  else
+    topology->first_dist = dist;
+  dist->prev = topology->last_dist;
+  dist->next = NULL;
+  topology->last_dist = dist;
+  return 0;
+
+ err_with_dist:
+  free(dist); /* only the struct: the arrays it may point to are the caller-given ones freed below */
+ err:
+  free(different_types);
+  free(objs);
+  free(indexes);
+  free(values);
+  return -1;
+}
+
+int hwloc_internal_distances_add_by_index(hwloc_topology_t topology, const char *name, /* adds a matrix described by indexes only; the given arrays are freed on failure */
+					  hwloc_obj_type_t unique_type, hwloc_obj_type_t *different_types, unsigned nbobjs, uint64_t *indexes, uint64_t *values,
+					  unsigned long kind, unsigned long flags)
+{
+  unsigned iflags = 0; /* objs not valid */
+
+  if (nbobjs < 2) {
+    errno = EINVAL;
+    goto err;
+  }
+
+  /* cannot group without objects,
+   * and we don't group from XML anyway since the hwloc that generated the XML should have grouped already.
+   */
+  if (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP) {
+    errno = EINVAL;
+    goto err;
   }
 
-  hwloc_distances_set(topology, type, nbobjs, indexes, NULL, distances, 1 /* force */);
+  return hwloc_internal_distances__add(topology, name, unique_type, different_types, nbobjs, NULL, indexes, values, kind, iflags); /* frees the arrays itself when it fails */
+
+ err:
+  free(indexes);
+  free(values);
+  free(different_types);
+  return -1;
 }
 
-/* take distances in the environment, store them as is in the topology.
- * we'll convert them into object later once the tree is filled
- */
-void hwloc_distances_set_from_env(struct hwloc_topology *topology)
+static void
+hwloc_internal_distances_restrict(hwloc_obj_t *objs,
+				  uint64_t *indexes,
+				  uint64_t *values,
+				  unsigned nbobjs, unsigned disappeared);
+
+int hwloc_internal_distances_add(hwloc_topology_t topology, const char *name, /* adds a matrix given by objects, optionally grouping them first; objs and values are freed on failure */
+				 unsigned nbobjs, hwloc_obj_t *objs, uint64_t *values,
+				 unsigned long kind, unsigned long flags)
 {
-  hwloc_obj_type_t type;
-  for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
-    const char *env;
-    char envname[64];
-    snprintf(envname, sizeof(envname), "HWLOC_%s_DISTANCES", hwloc_obj_type_string(type));
-    env = getenv(envname);
-    if (env) {
-      hwloc_localeswitch_declare;
-      hwloc_localeswitch_init();
-      hwloc_distances__set_from_string(topology, type, env);
-      hwloc_localeswitch_fini();
+  hwloc_obj_type_t unique_type, *different_types;
+  unsigned i, disappeared = 0;
+  unsigned iflags = HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID;
+
+  if (nbobjs < 2) {
+    errno = EINVAL;
+    goto err;
+  }
+
+  /* is there any NULL object? (useful in case of problem during insert in backends) */
+  for(i=0; i<nbobjs; i++)
+    if (!objs[i])
+      disappeared++;
+  if (disappeared) {
+    /* some objects are NULL */
+    if (disappeared == nbobjs) {
+      /* nothing left, drop the matrix */
+      free(objs);
+      free(values);
+      return 0; /* dropping the matrix is reported as success */
+    }
+    /* restrict the matrix */
+    hwloc_internal_distances_restrict(objs, NULL, values, nbobjs, disappeared);
+    nbobjs -= disappeared; /* NOTE(review): may now be 1, the nbobjs < 2 check above ran before the restriction */
+  }
+
+  unique_type = objs[0]->type;
+  for(i=1; i<nbobjs; i++)
+    if (objs[i]->type != unique_type) {
+      unique_type = HWLOC_OBJ_TYPE_NONE;
+      break;
+    }
+  if (unique_type == HWLOC_OBJ_TYPE_NONE) {
+    /* heterogeneous types */
+    different_types = malloc(nbobjs * sizeof(*different_types));
+    if (!different_types)
+      goto err;
+    for(i=0; i<nbobjs; i++)
+      different_types[i] = objs[i]->type;
+
+  } else {
+    /* homogeneous types */
+    different_types = NULL;
+  }
+
+  if (topology->grouping && (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP) && !different_types) { /* grouping is only tried when all objects share one type */
+    float full_accuracy = 0.f;
+    float *accuracies;
+    unsigned nbaccuracies;
+
+    if (flags & HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE) {
+      accuracies = topology->grouping_accuracies;
+      nbaccuracies = topology->grouping_nbaccuracies;
+    } else {
+      accuracies = &full_accuracy; /* a single accuracy of 0 */
+      nbaccuracies = 1;
     }
+
+    if (topology->grouping_verbose) {
+      unsigned j;
+      int gp = !HWLOC_DIST_TYPE_USE_OS_INDEX(unique_type);
+      fprintf(stderr, "Trying to group objects using distance matrix:\n");
+      fprintf(stderr, "%s", gp ? "gp_index" : "os_index");
+      for(j=0; j<nbobjs; j++)
+	fprintf(stderr, " % 5d", (int)(gp ? objs[j]->gp_index : objs[j]->os_index));
+      fprintf(stderr, "\n");
+      for(i=0; i<nbobjs; i++) {
+	fprintf(stderr, "  % 5d", (int)(gp ? objs[i]->gp_index : objs[i]->os_index));
+	for(j=0; j<nbobjs; j++)
+	  fprintf(stderr, " % 5lld", (long long) values[i*nbobjs + j]);
+	fprintf(stderr, "\n");
+      }
+    }
+
+    hwloc__groups_by_distances(topology, nbobjs, objs, values,
+			       kind, nbaccuracies, accuracies, 1 /* check the first matrix */);
   }
+
+  return hwloc_internal_distances__add(topology, name, unique_type, different_types, nbobjs, objs, NULL, values, kind, iflags); /* frees all the arrays itself when it fails */
+
+ err: /* different_types was never allocated on the paths that jump here */
+  free(objs);
+  free(values);
+  return -1;
 }
 
-/* The actual set() function exported to the user
- *
- * take the given distance, store them as is in the topology.
- * we'll convert them into object later once the tree is filled.
+#define HWLOC_DISTANCES_KIND_FROM_ALL (HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_FROM_USER) /* all FROM_* kind bits */
+#define HWLOC_DISTANCES_KIND_MEANS_ALL (HWLOC_DISTANCES_KIND_MEANS_LATENCY|HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH) /* all MEANS_* kind bits */
+#define HWLOC_DISTANCES_KIND_ALL (HWLOC_DISTANCES_KIND_FROM_ALL|HWLOC_DISTANCES_KIND_MEANS_ALL) /* every kind bit accepted by hwloc_distances_add() */
+#define HWLOC_DISTANCES_ADD_FLAG_ALL (HWLOC_DISTANCES_ADD_FLAG_GROUP|HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE) /* every flag accepted by hwloc_distances_add() */
+
+/* The actual function exported to the user: validates the arguments, then gives copies of objs and values to hwloc_internal_distances_add(). Returns 0, or -1 with errno set on invalid input.
  */
-int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
-				       unsigned nbobjs, unsigned *indexes, float *distances)
+int hwloc_distances_add(hwloc_topology_t topology,
+			unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values,
+			unsigned long kind, unsigned long flags)
 {
-  unsigned *_indexes;
-  float *_distances;
+  unsigned i;
+  uint64_t *_values;
+  hwloc_obj_t *_objs;
+  int err;
 
-  if (!nbobjs && !indexes && !distances) {
-    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
-    return 0;
+  if (nbobjs < 2 || !objs || !values || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
   }
-
-  if (!nbobjs || !indexes || !distances)
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
     return -1;
-
-  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0)
+  }
+  if ((kind & ~HWLOC_DISTANCES_KIND_ALL)
+      || hwloc_weight_long(kind & HWLOC_DISTANCES_KIND_FROM_ALL) != 1
+      || hwloc_weight_long(kind & HWLOC_DISTANCES_KIND_MEANS_ALL) != 1
+      || (flags & ~HWLOC_DISTANCES_ADD_FLAG_ALL)) {
+    errno = EINVAL; /* unknown bits, or not exactly one FROM_* and one MEANS_* kind */
     return -1;
+  }
+
+  /* no strict need to check for duplicates, things shouldn't break */
+
+  for(i=0; i<nbobjs; i++) /* starts at 0 so that a NULL first object is rejected like any other */
+    if (!objs[i]) {
+      errno = EINVAL;
+      return -1;
+    }
 
   /* copy the input arrays and give them to the topology */
-  _indexes = malloc(nbobjs*sizeof(unsigned));
-  memcpy(_indexes, indexes, nbobjs*sizeof(unsigned));
-  _distances = malloc(nbobjs*nbobjs*sizeof(float));
-  memcpy(_distances, distances, nbobjs*nbobjs*sizeof(float));
-  hwloc_distances_set(topology, type, nbobjs, _indexes, NULL, _distances, 1 /* force */);
+  _objs = malloc(nbobjs*sizeof(hwloc_obj_t));
+  _values = malloc(nbobjs*nbobjs*sizeof(*_values));
+  if (!_objs || !_values)
+    goto out_with_arrays;
+
+  memcpy(_objs, objs, nbobjs*sizeof(hwloc_obj_t));
+  memcpy(_values, values, nbobjs*nbobjs*sizeof(*_values));
+  err = hwloc_internal_distances_add(topology, NULL, nbobjs, _objs, _values, kind, flags);
+  if (err < 0)
+    goto out; /* _objs and _values freed in hwloc_internal_distances_add() */
+
+  /* in case we added some groups, see if we need to reconnect */
+  hwloc_topology_reconnect(topology, 0);
 
   return 0;
+
+ out_with_arrays:
+  free(_values); /* free(NULL) is fine when only one of the two allocations failed */
+  free(_objs);
+ out:
+  return -1;
 }
 
-/************************
- * Restricting distances
+/******************************************************
+ * Refresh objects in distances
  */
 
-/* called when some objects have been removed because empty/ignored/cgroup/restrict,
- * we must rebuild the list of objects from indexes (in hwloc_distances_finalize_os())
- */
-void hwloc_distances_restrict_os(struct hwloc_topology *topology)
+static hwloc_obj_t hwloc_find_obj_by_depth_and_gp_index(hwloc_topology_t topology, unsigned depth, uint64_t gp_index) /* walks one level through next_cousin; NULL if no object matches */
 {
-  struct hwloc_os_distances_s * osdist;
-  for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
-    /* remove the objs array, we'll rebuild it from the indexes
-     * depending on remaining objects */
-    free(osdist->objs);
-    osdist->objs = NULL;
+  hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0); /* first object of the level */
+  while (obj) {
+    if (obj->gp_index == gp_index)
+      return obj;
+    obj = obj->next_cousin;
   }
+  return NULL;
 }
 
-
-/* cleanup everything we created from distances so that we may rebuild them
- * at the end of restrict()
- */
-void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags)
+static hwloc_obj_t hwloc_find_obj_by_type_and_gp_index(hwloc_topology_t topology, hwloc_obj_type_t type, uint64_t gp_index) /* searches every level holding this type; NULL if not found */
 {
-  if (flags & HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES) {
-    /* some objects may have been removed, clear objects arrays so that finalize_os rebuilds them properly */
-    hwloc_distances_restrict_os(topology);
-  } else {
-    /* if not adapting distances, drop everything */
-    hwloc_distances_destroy(topology);
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return NULL;
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE) { /* the type exists at several depths, try each of them */
+    int topodepth = hwloc_topology_get_depth(topology);
+    for(depth=0; depth<topodepth; depth++) {
+      if (hwloc_get_depth_type(topology, depth) == type) {
+	hwloc_obj_t obj = hwloc_find_obj_by_depth_and_gp_index(topology, depth, gp_index);
+	if (obj)
+	  return obj;
+      }
+    }
+    return NULL;
   }
+  return hwloc_find_obj_by_depth_and_gp_index(topology, depth, gp_index);
 }
 
-/**************************************************************
- * Convert user/env given array of indexes into actual objects
- */
-
-static hwloc_obj_t hwloc_find_obj_by_type_and_os_index(hwloc_obj_t root, hwloc_obj_type_t type, unsigned os_index)
+static void /* compacts objs, indexes (if given) and values in place, dropping the rows and columns of NULL objects */
+hwloc_internal_distances_restrict(hwloc_obj_t *objs,
+				  uint64_t *indexes,
+				  uint64_t *values,
+				  unsigned nbobjs, unsigned disappeared)
 {
-  hwloc_obj_t child;
-  if (root->type == type && root->os_index == os_index)
-    return root;
-  child = root->first_child;
-  while (child) {
-    hwloc_obj_t found = hwloc_find_obj_by_type_and_os_index(child, type, os_index);
-    if (found)
-      return found;
-    child = child->next_sibling;
-  }
-  return NULL;
+  unsigned i, newi;
+  unsigned j, newj;
+
+  for(i=0, newi=0; i<nbobjs; i++) /* first pass: values, while objs still tells which entries are NULL */
+    if (objs[i]) {
+      for(j=0, newj=0; j<nbobjs; j++)
+	if (objs[j]) {
+	  values[newi*(nbobjs-disappeared)+newj] = values[i*nbobjs+j]; /* the destination never lies after the source */
+	  newj++;
+	}
+      newi++;
+    }
+
+  for(i=0, newi=0; i<nbobjs; i++) /* second pass: objs and indexes */
+    if (objs[i]) {
+      objs[newi] = objs[i];
+      if (indexes)
+	indexes[newi] = indexes[i];
+      newi++;
+    }
 }
 
-/* convert distance indexes that were previously stored in the topology
- * into actual objects if not done already.
- * it's already done when distances come from backends (this function should not be called then).
- * it's not done when distances come from the user.
- *
- * returns -1 if the matrix was invalid
- */
 static int
-hwloc_distances__finalize_os(struct hwloc_topology *topology, struct hwloc_os_distances_s *osdist)
+hwloc_internal_distances_refresh_one(hwloc_topology_t topology, /* rebuilds dist->objs from dist->indexes; returns -1 when fewer than 2 objects remain */
+				     struct hwloc_internal_distances_s *dist)
 {
-  unsigned nbobjs = osdist->nbobjs;
-  unsigned *indexes = osdist->indexes;
-  float *distances = osdist->distances;
-  unsigned i, j;
-  hwloc_obj_type_t type = osdist->type;
-  hwloc_obj_t *objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+  hwloc_obj_type_t unique_type = dist->unique_type;
+  hwloc_obj_type_t *different_types = dist->different_types;
+  unsigned nbobjs = dist->nbobjs;
+  hwloc_obj_t *objs = dist->objs;
+  uint64_t *indexes = dist->indexes;
+  unsigned disappeared = 0;
+  unsigned i;
 
-  assert(!osdist->objs);
+  if (dist->iflags & HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID)
+    return 0; /* objs is already up to date */
 
-  /* traverse the topology and look for the relevant objects */
   for(i=0; i<nbobjs; i++) {
-    hwloc_obj_t obj = hwloc_find_obj_by_type_and_os_index(topology->levels[0][0], type, indexes[i]);
-    if (!obj) {
-
-      /* shift the matrix */
-#define OLDPOS(i,j) (distances+(i)*nbobjs+(j))
-#define NEWPOS(i,j) (distances+(i)*(nbobjs-1)+(j))
-      if (i>0) {
-	/** no need to move beginning of 0th line */
-	for(j=0; j<i-1; j++)
-	  /** move end of jth line + beginning of (j+1)th line */
-	  memmove(NEWPOS(j,i), OLDPOS(j,i+1), (nbobjs-1)*sizeof(*distances));
-	/** move end of (i-1)th line */
-	memmove(NEWPOS(i-1,i), OLDPOS(i-1,i+1), (nbobjs-i-1)*sizeof(*distances));
-      }
-      if (i<nbobjs-1) {
-	/** move beginning of (i+1)th line */
-	memmove(NEWPOS(i,0), OLDPOS(i+1,0), i*sizeof(*distances));
-	/** move end of jth line + beginning of (j+1)th line */
-	for(j=i; j<nbobjs-2; j++)
-	  memmove(NEWPOS(j,i), OLDPOS(j+1,i+1), (nbobjs-1)*sizeof(*distances));
-	/** move end of (nbobjs-2)th line */
-	memmove(NEWPOS(nbobjs-2,i), OLDPOS(nbobjs-1,i+1), (nbobjs-i-1)*sizeof(*distances));
-      }
-
-      /* shift the indexes array */
-      memmove(indexes+i, indexes+i+1, (nbobjs-i-1)*sizeof(*indexes));
-
-      /* update counters */
-      nbobjs--;
-      i--;
-      continue;
+    hwloc_obj_t obj;
+    /* TODO use cpuset/nodeset to find pus/numas from the root?
+     * faster than traversing the entire level?
+     */
+    if (HWLOC_DIST_TYPE_USE_OS_INDEX(unique_type)) {
+      if (unique_type == HWLOC_OBJ_PU)
+	obj = hwloc_get_pu_obj_by_os_index(topology, (unsigned) indexes[i]);
+      else if (unique_type == HWLOC_OBJ_NUMANODE)
+	obj = hwloc_get_numanode_obj_by_os_index(topology, (unsigned) indexes[i]);
+      else
+	abort(); /* NOTE(review): presumably unreachable, assuming the macro only accepts PU and NUMANODE -- confirm */
+    } else {
+      obj = hwloc_find_obj_by_type_and_gp_index(topology, different_types ? different_types[i] : unique_type, indexes[i]);
     }
     objs[i] = obj;
+    if (!obj)
+      disappeared++; /* the NULL entry is removed by the restrict call below */
   }
 
-  osdist->nbobjs = nbobjs;
-  if (!nbobjs) {
-    /* the whole matrix was invalid, let the caller remove this distances */
-    free(objs);
+  if (nbobjs-disappeared < 2)
+    /* became useless, drop */
     return -1;
+
+  if (disappeared) {
+    hwloc_internal_distances_restrict(objs, dist->indexes, dist->values, nbobjs, disappeared);
+    dist->nbobjs -= disappeared;
   }
 
-  /* setup the objs array */
-  osdist->objs = objs;
+  dist->iflags |= HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID;
   return 0;
 }
 
-
-void hwloc_distances_finalize_os(struct hwloc_topology *topology)
+/* Refresh the objs of every matrix and remove those that became useless. This function may be called with topology->tma set, it cannot free() or realloc() */
+void
+hwloc_internal_distances_refresh(hwloc_topology_t topology)
 {
-  struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
-  while ((osdist = next) != NULL) {
-    int err;
-    next = osdist->next;
-
-    /* remove final distance matrics AND physically-ordered ones */
-
-    if (osdist->objs)
-      /* nothing to do, switch to the next element */
+  struct hwloc_internal_distances_s *dist, *next;
+
+  for(dist = topology->first_dist; dist; dist = next) {
+    next = dist->next; /* saved first because dist may be freed below */
+
+    if (hwloc_internal_distances_refresh_one(topology, dist) < 0) {
+      assert(!topology->tma || !topology->tma->dontfree); /* this tma cannot fail to allocate */
+      if (dist->prev)
+	dist->prev->next = next;
+      else
+	topology->first_dist = next;
+      if (next)
+	next->prev = dist->prev;
+      else
+	topology->last_dist = dist->prev;
+      hwloc_internal_distances_free(dist);
       continue;
+    }
+  }
+}
 
-    err = hwloc_distances__finalize_os(topology, osdist);
-    if (!err)
-      /* convert ok, switch to the next element */
-      continue;
+/* Clear HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID on every distances structure in the topology list. */
+void hwloc_internal_distances_invalidate_cached_objs(hwloc_topology_t topology)
+{
+  struct hwloc_internal_distances_s *cur = topology->first_dist;
+  for( ; cur != NULL; cur = cur->next)
+    cur->iflags &= ~HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID;
+}
 
-    /* remove this element */
-    free(osdist->indexes);
-    free(osdist->distances);
-    /* remove current object */
-    if (osdist->prev)
-      osdist->prev->next = next;
-    else
-      topology->first_osdist = next;
-    if (next)
-      next->prev = osdist->prev;
-    else
-      topology->last_osdist = osdist->prev;
-    /* free current object */
-    free(osdist);
-  }
-}
-
-/***********************************************************
- * Convert internal distances given by the backend/env/user
- * into exported logical distances attached to objects
+/******************************************************
+ * User API for getting distances
  */
 
-static void
-hwloc_distances__finalize_logical(struct hwloc_topology *topology,
-				  unsigned nbobjs,
-				  hwloc_obj_t *objs, float *osmatrix)
-{
-  unsigned i, j, li, lj, minl;
-  float min = FLT_MAX, max = FLT_MIN;
-  hwloc_obj_t root;
-  float *matrix;
-  hwloc_cpuset_t cpuset, complete_cpuset;
-  hwloc_nodeset_t nodeset, complete_nodeset;
-  unsigned relative_depth;
-  int idx;
-
-  /* find the root */
-  cpuset = hwloc_bitmap_alloc();
-  complete_cpuset = hwloc_bitmap_alloc();
-  nodeset = hwloc_bitmap_alloc();
-  complete_nodeset = hwloc_bitmap_alloc();
-  for(i=0; i<nbobjs; i++) {
-    hwloc_bitmap_or(cpuset, cpuset, objs[i]->cpuset);
-    hwloc_bitmap_or(complete_cpuset, complete_cpuset, objs[i]->complete_cpuset);
-    hwloc_bitmap_or(nodeset, nodeset, objs[i]->nodeset);
-    hwloc_bitmap_or(complete_nodeset, complete_nodeset, objs[i]->complete_nodeset);
-  }
-  /* find the object covering cpuset, we'll take care of the nodeset later */
-  root = hwloc_get_obj_covering_cpuset(topology, cpuset);
-  /* walk up to find a parent that also covers the nodeset and complete sets */
-  while (root &&
-	 (!hwloc_bitmap_isincluded(nodeset, root->nodeset)
-	  || !hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset)
-	  || !hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset)))
-    root = root->parent;
-  if (!root) {
-    /* should not happen, ignore the distance matrix and report an error. */
-    if (!hwloc_hide_errors()) {
-      char *a, *b;
-      hwloc_bitmap_asprintf(&a, cpuset);
-      hwloc_bitmap_asprintf(&b, nodeset);
-      fprintf(stderr, "****************************************************************************\n");
-      fprintf(stderr, "* hwloc %s has encountered an error when adding a distance matrix to the topology.\n", HWLOC_VERSION);
-      fprintf(stderr, "*\n");
-      fprintf(stderr, "* hwloc_distances__finalize_logical() could not find any object covering\n");
-      fprintf(stderr, "* cpuset %s and nodeset %s\n", a, b);
-      fprintf(stderr, "*\n");
-      fprintf(stderr, "* Please report this error message to the hwloc user's mailing list,\n");
-#ifdef HWLOC_LINUX_SYS
-      fprintf(stderr, "* along with the output from the hwloc-gather-topology script.\n");
-#else
-      fprintf(stderr, "* along with any relevant topology information from your platform.\n");
-#endif
-      fprintf(stderr, "****************************************************************************\n");
-      free(a);
-      free(b);
-    }
-    hwloc_bitmap_free(cpuset);
-    hwloc_bitmap_free(complete_cpuset);
-    hwloc_bitmap_free(nodeset);
-    hwloc_bitmap_free(complete_nodeset);
-    return;
-  }
-  /* ideally, root has the exact cpuset and nodeset.
-   * but ignoring or other things that remove objects may cause the object array to reduce */
-  assert(hwloc_bitmap_isincluded(cpuset, root->cpuset));
-  assert(hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset));
-  assert(hwloc_bitmap_isincluded(nodeset, root->nodeset));
-  assert(hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset));
-  hwloc_bitmap_free(cpuset);
-  hwloc_bitmap_free(complete_cpuset);
-  hwloc_bitmap_free(nodeset);
-  hwloc_bitmap_free(complete_nodeset);
-  if (root->depth >= objs[0]->depth) {
-    /* strange topology led us to find invalid relative depth, ignore */
-    return;
-  }
-  relative_depth = objs[0]->depth - root->depth; /* this assume that we have distances between objects of the same level */
+/* What we actually allocate for user queries, even if we only
+ * return the distances part of it (the address of the distances field).
+ */
+struct hwloc_distances_container_s {
+  unsigned id; /* copied from the id of the internal distances this was built from */
+  struct hwloc_distances_s distances; /* the public structure handed to the caller */
+};
 
-  if (nbobjs != hwloc_get_nbobjs_inside_cpuset_by_depth(topology, root->cpuset, root->depth + relative_depth))
-    /* the root does not cover the right number of objects, maybe we failed to insert a root (bad intersect or so). */
-    return;
+#define HWLOC_DISTANCES_CONTAINER_OFFSET ((char*)&((struct hwloc_distances_container_s*)NULL)->distances - (char*)NULL) /* byte offset of the distances field in its container */
+#define HWLOC_DISTANCES_CONTAINER(_d) ((struct hwloc_distances_container_s *) ( ((char*)(_d)) - HWLOC_DISTANCES_CONTAINER_OFFSET )) /* container enclosing the public structure _d */
 
-  /* get the logical index offset, it's the min of all logical indexes */
-  minl = UINT_MAX;
-  for(i=0; i<nbobjs; i++)
-    if (minl > objs[i]->logical_index)
-      minl = objs[i]->logical_index;
+/* Return the internal distances whose id matches the container of this public structure, or NULL. */
+static struct hwloc_internal_distances_s *
+hwloc__internal_distances_from_public(hwloc_topology_t topology, struct hwloc_distances_s *distances)
+{
+  const unsigned wanted = (HWLOC_DISTANCES_CONTAINER(distances))->id;
+  struct hwloc_internal_distances_s *cur = topology->first_dist;
+  while (cur && cur->id != wanted)
+    cur = cur->next;
+  return cur;
+}
 
-  /* compute/check min/max values */
-  for(i=0; i<nbobjs; i++)
-    for(j=0; j<nbobjs; j++) {
-      float val = osmatrix[i*nbobjs+j];
-      if (val < min)
-	min = val;
-      if (val > max)
-	max = val;
-    }
-  if (!min) {
-    /* Linux up to 2.6.36 reports ACPI SLIT distances, which should be memory latencies.
-     * Except of SGI IP27 (SGI Origin 200/2000 with MIPS processors) where the distances
-     * are the number of hops between routers.
-     */
-    hwloc_debug("%s", "minimal distance is 0, matrix does not seem to contain latencies, ignoring\n");
-    return;
-  }
+/* Free a structure built by hwloc_distances_get_one(): its two arrays, then the enclosing container. */
+void hwloc_distances_release(hwloc_topology_t topology __hwloc_attribute_unused,
+			     struct hwloc_distances_s *distances)
+{
+  void *container = HWLOC_DISTANCES_CONTAINER(distances);
+  free(distances->objs);
+  free(distances->values);
+  free(container);
+}
 
-  /* store the normalized latency matrix in the root object */
-  idx = root->distances_count++;
-  root->distances = realloc(root->distances, root->distances_count * sizeof(struct hwloc_distances_s *));
-  root->distances[idx] = malloc(sizeof(struct hwloc_distances_s));
-  root->distances[idx]->relative_depth = relative_depth;
-  root->distances[idx]->nbobjs = nbobjs;
-  root->distances[idx]->latency = matrix = malloc(nbobjs*nbobjs*sizeof(float));
-  root->distances[idx]->latency_base = (float) min;
-#define NORMALIZE_LATENCY(d) ((d)/(min))
-  root->distances[idx]->latency_max = NORMALIZE_LATENCY(max);
-  for(i=0; i<nbobjs; i++) {
-    li = objs[i]->logical_index - minl;
-    matrix[li*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+i]);
-    for(j=i+1; j<nbobjs; j++) {
-      lj = objs[j]->logical_index - minl;
-      matrix[li*nbobjs+lj] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+j]);
-      matrix[lj*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[j*nbobjs+i]);
-    }
-  }
+/* Return the name stored in the internal distances matching this public structure, NULL if no match. */
+const char *hwloc_distances_get_name(hwloc_topology_t topology, struct hwloc_distances_s *distances)
+{
+  struct hwloc_internal_distances_s *match = hwloc__internal_distances_from_public(topology, distances);
+  return match ? match->name : NULL;
 }
 
-/* convert internal distances into logically-ordered distances
- * that can be exposed in the API
- */
-void
-hwloc_distances_finalize_logical(struct hwloc_topology *topology)
+static struct hwloc_distances_s *
+hwloc_distances_get_one(hwloc_topology_t topology __hwloc_attribute_unused,
+			struct hwloc_internal_distances_s *dist)
 {
+  struct hwloc_distances_container_s *cont;
+  struct hwloc_distances_s *distances;
   unsigned nbobjs;
-  int depth;
-  struct hwloc_os_distances_s * osdist;
-  for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
 
-    nbobjs = osdist->nbobjs;
-    if (!nbobjs)
+  cont = malloc(sizeof(*cont));
+  if (!cont)
+    return NULL;
+  distances = &cont->distances;
+
+  nbobjs = distances->nbobjs = dist->nbobjs;
+
+  distances->objs = malloc(nbobjs * sizeof(hwloc_obj_t));
+  if (!distances->objs)
+    goto out;
+  memcpy(distances->objs, dist->objs, nbobjs * sizeof(hwloc_obj_t));
+
+  distances->values = malloc(nbobjs * nbobjs * sizeof(*distances->values));
+  if (!distances->values)
+    goto out_with_objs;
+  memcpy(distances->values, dist->values, nbobjs*nbobjs*sizeof(*distances->values));
+
+  distances->kind = dist->kind;
+
+  cont->id = dist->id;
+  return distances;
+
+ out_with_objs:
+  free(distances->objs);
+ out:
+  free(cont);
+  return NULL;
+}
+
+static int
+hwloc__distances_get(hwloc_topology_t topology,
+		     const char *name, hwloc_obj_type_t type,
+		     unsigned *nrp, struct hwloc_distances_s **distancesp,
+		     unsigned long kind, unsigned long flags __hwloc_attribute_unused)
+{
+  struct hwloc_internal_distances_s *dist;
+  unsigned nr = 0, i;
+
+  /* We could return the internal arrays (as const),
+   * but it would require to prevent removing distances between get() and free().
+   * Not performance critical anyway.
+   */
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* we could refresh only the distances that match, but we won't have many distances anyway,
+   * so performance is totally negligible.
+   *
+   * This is also useful in multithreaded apps that modify the topology.
+   * They can call any valid hwloc_distances_get() to force a refresh after
+   * changing the topology, so that future concurrent get() won't cause
+   * concurrent refresh().
+   */
+  hwloc_internal_distances_refresh(topology);
+
+  for(dist = topology->first_dist; dist; dist = dist->next) {
+    unsigned long kind_from = kind & HWLOC_DISTANCES_KIND_FROM_ALL;
+    unsigned long kind_means = kind & HWLOC_DISTANCES_KIND_MEANS_ALL;
+
+    if (name && (!dist->name || strcmp(name, dist->name)))
       continue;
 
-    depth = hwloc_get_type_depth(topology, osdist->type);
-    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    if (type != HWLOC_OBJ_TYPE_NONE && type != dist->unique_type)
       continue;
 
-    if (osdist->objs) {
-      assert(osdist->distances);
-      hwloc_distances__finalize_logical(topology, nbobjs,
-					osdist->objs,
-					osdist->distances);
+    if (kind_from && !(kind_from & dist->kind))
+      continue;
+    if (kind_means && !(kind_means & dist->kind))
+      continue;
+
+    if (nr < *nrp) {
+      struct hwloc_distances_s *distances = hwloc_distances_get_one(topology, dist);
+      if (!distances)
+	goto error;
+      distancesp[nr] = distances;
     }
+    nr++;
   }
+
+  for(i=nr; i<*nrp; i++)
+    distancesp[i] = NULL;
+  *nrp = nr;
+  return 0;
+
+ error:
+  for(i=0; i<nr; i++)
+    hwloc_distances_release(topology, distancesp[i]);
+  return -1;
 }
 
-/***************************************************
- * Destroying logical distances attached to objects
- */
+int
+hwloc_distances_get(hwloc_topology_t topology,
+		    unsigned *nrp, struct hwloc_distances_s **distancesp,
+		    unsigned long kind, unsigned long flags)
+{
+  if (flags || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
 
-/* destroy an object distances structure */
-void
-hwloc_clear_object_distances_one(struct hwloc_distances_s * distances)
+  return hwloc__distances_get(topology, NULL, HWLOC_OBJ_TYPE_NONE, nrp, distancesp, kind, flags);
+}
+
+/* Get the distances between objects of the type found at the given depth.
+ * Sets errno to EINVAL and returns -1 if flags is non-zero, if the topology
+ * is not loaded, or if the depth does not map to an object type.
+ */
+int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+			     unsigned *nrp, struct hwloc_distances_s **distancesp,
+			     unsigned long kind, unsigned long flags)
+{
+  hwloc_obj_type_t depth_type = (hwloc_obj_type_t) -1;
+
+  /* FIXME: passing the depth of a group level may return group distances at a different depth */
+  if (!flags && topology->is_loaded)
+    depth_type = hwloc_get_depth_type(topology, depth);
+  if (depth_type == (hwloc_obj_type_t) -1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  return hwloc__distances_get(topology, NULL, depth_type, nrp, distancesp, kind, flags);
+}
+
+int
+hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
+			    unsigned *nrp, struct hwloc_distances_s **distancesp,
+			    unsigned long flags)
 {
-  free(distances->latency);
-  free(distances);
+  if (flags || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  return hwloc__distances_get(topology, name, HWLOC_OBJ_TYPE_NONE, nrp, distancesp, HWLOC_DISTANCES_KIND_ALL, flags);
 }
 
-void
-hwloc_clear_object_distances(hwloc_obj_t obj)
+int
+hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
+			    unsigned *nrp, struct hwloc_distances_s **distancesp,
+			    unsigned long kind, unsigned long flags)
 {
-  unsigned i;
-  for (i=0; i<obj->distances_count; i++)
-    hwloc_clear_object_distances_one(obj->distances[i]);
-  free(obj->distances);
-  obj->distances = NULL;
-  obj->distances_count = 0;
+  if (flags || !topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  return hwloc__distances_get(topology, NULL, type, nrp, distancesp, kind, flags);
 }
 
-/******************************************
+/******************************************************
  * Grouping objects according to distances
  */
 
 static void hwloc_report_user_distance_error(const char *msg, int line)
 {
-    static int reported = 0;
-
-    if (!reported && !hwloc_hide_errors()) {
-        fprintf(stderr, "****************************************************************************\n");
-        fprintf(stderr, "* hwloc %s has encountered what looks like an error from user-given distances.\n", HWLOC_VERSION);
-        fprintf(stderr, "*\n");
-        fprintf(stderr, "* %s\n", msg);
-        fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
-        fprintf(stderr, "*\n");
-        fprintf(stderr, "* Please make sure that distances given through the interface or environment\n");
-        fprintf(stderr, "* variables do not contradict any other topology information.\n");
-        fprintf(stderr, "****************************************************************************\n");
-        reported = 1;
-    }
+  static int reported = 0;
+
+  if (!reported && !hwloc_hide_errors()) {
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s was given invalid distances by the user.\n", HWLOC_VERSION);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* %s\n", msg);
+    fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* Please make sure that distances given through the programming API\n");
+    fprintf(stderr, "* do not contradict any other topology information.\n");
+    fprintf(stderr, "* \n");
+    fprintf(stderr, "* hwloc will now ignore this invalid topology information and continue.\n");
+    fprintf(stderr, "****************************************************************************\n");
+    reported = 1;
+  }
 }
 
-static int hwloc_compare_distances(float a, float b, float accuracy)
+static int hwloc_compare_values(uint64_t a, uint64_t b, float accuracy)
 {
-  if (accuracy != 0.0 && fabsf(a-b) < a * accuracy)
+  if (accuracy != 0.0f && fabsf((float)a-(float)b) < (float)a * accuracy)
     return 0;
   return a < b ? -1 : a == b ? 0 : 1;
 }
 
 /*
- * Place objects in groups if they are in a transitive graph of minimal distances.
+ * Place objects in groups if they are in a transitive graph of minimal values.
  * Return how many groups were created, or 0 if some incomplete distance graphs were found.
  */
 static unsigned
 hwloc__find_groups_by_min_distance(unsigned nbobjs,
-				   float *_distances,
+				   uint64_t *_values,
 				   float accuracy,
 				   unsigned *groupids,
 				   int verbose)
 {
-  float min_distance = FLT_MAX;
+  uint64_t min_distance = UINT64_MAX;
   unsigned groupid = 1;
   unsigned i,j,k;
   unsigned skipped = 0;
 
-#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+#define VALUE(i, j) _values[(i) * nbobjs + (j)]
 
   memset(groupids, 0, nbobjs*sizeof(*groupids));
 
   /* find the minimal distance */
   for(i=0; i<nbobjs; i++)
     for(j=0; j<nbobjs; j++) /* check the entire matrix, it may not be perfectly symmetric depending on the accuracy */
-      if (i != j && DISTANCE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
-        min_distance = DISTANCE(i, j);
-  hwloc_debug("found minimal distance %f between objects\n", min_distance);
+      if (i != j && VALUE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
+        min_distance = VALUE(i, j);
+  hwloc_debug("  found minimal distance %llu between objects\n", (unsigned long long) min_distance);
 
-  if (min_distance == FLT_MAX)
+  if (min_distance == UINT64_MAX)
     return 0;
 
   /* build groups of objects connected with this distance */
   for(i=0; i<nbobjs; i++) {
     unsigned size;
-    int firstfound;
+    unsigned firstfound;
 
     /* if already grouped, skip */
     if (groupids[i])
@@ -694,24 +943,24 @@ hwloc__find_groups_by_min_distance(unsigned nbobjs,
     size = 1;
     firstfound = i;
 
-    while (firstfound != -1) {
+    while (firstfound != (unsigned)-1) {
       /* we added new objects to the group, the first one was firstfound.
        * rescan all connections from these new objects (starting at first found) to any other objects,
        * so as to find new objects minimally-connected by transivity.
        */
-      int newfirstfound = -1;
+      unsigned newfirstfound = (unsigned)-1;
       for(j=firstfound; j<nbobjs; j++)
 	if (groupids[j] == groupid)
 	  for(k=0; k<nbobjs; k++)
-              if (!groupids[k] && !hwloc_compare_distances(DISTANCE(j, k), min_distance, accuracy)) {
+              if (!groupids[k] && !hwloc_compare_values(VALUE(j, k), min_distance, accuracy)) {
 	      groupids[k] = groupid;
 	      size++;
-	      if (newfirstfound == -1)
+	      if (newfirstfound == (unsigned)-1)
 		newfirstfound = k;
 	      if (i == j)
-		hwloc_debug("object %u is minimally connected to %u\n", k, i);
+		hwloc_debug("  object %u is minimally connected to %u\n", k, i);
 	      else
-	        hwloc_debug("object %u is minimally connected to %u through %u\n", k, i, j);
+	        hwloc_debug("  object %u is minimally connected to %u through %u\n", k, i, j);
 	    }
       firstfound = newfirstfound;
     }
@@ -726,8 +975,8 @@ hwloc__find_groups_by_min_distance(unsigned nbobjs,
     /* valid this group */
     groupid++;
     if (verbose)
-      fprintf(stderr, "Found transitive graph with %u objects with minimal distance %f accuracy %f\n",
-	      size, min_distance, accuracy);
+      fprintf(stderr, " Found transitive graph with %u objects with minimal distance %llu accuracy %f\n",
+	      size, (unsigned long long) min_distance, accuracy);
   }
 
   if (groupid == 2 && !skipped)
@@ -740,23 +989,23 @@ hwloc__find_groups_by_min_distance(unsigned nbobjs,
 
 /* check that the matrix is ok */
 static int
-hwloc__check_grouping_matrix(unsigned nbobjs, float *_distances, float accuracy, int verbose)
+hwloc__check_grouping_matrix(unsigned nbobjs, uint64_t *_values, float accuracy, int verbose)
 {
   unsigned i,j;
   for(i=0; i<nbobjs; i++) {
     for(j=i+1; j<nbobjs; j++) {
       /* should be symmetric */
-      if (hwloc_compare_distances(DISTANCE(i, j), DISTANCE(j, i), accuracy)) {
+      if (hwloc_compare_values(VALUE(i, j), VALUE(j, i), accuracy)) {
 	if (verbose)
-	  fprintf(stderr, "Distance matrix asymmetric ([%u,%u]=%f != [%u,%u]=%f), aborting\n",
-		  i, j, DISTANCE(i, j), j, i, DISTANCE(j, i));
+	  fprintf(stderr, " Distance matrix asymmetric ([%u,%u]=%llu != [%u,%u]=%llu), aborting\n",
+		  i, j, (unsigned long long) VALUE(i, j), j, i, (unsigned long long) VALUE(j, i));
 	return -1;
       }
       /* diagonal is smaller than everything else */
-      if (hwloc_compare_distances(DISTANCE(i, j), DISTANCE(i, i), accuracy) <= 0) {
+      if (hwloc_compare_values(VALUE(i, j), VALUE(i, i), accuracy) <= 0) {
 	if (verbose)
-	  fprintf(stderr, "Distance to self not strictly minimal ([%u,%u]=%f <= [%u,%u]=%f), aborting\n",
-		  i, j, DISTANCE(i, j), i, i, DISTANCE(i, i));
+	  fprintf(stderr, " Distance to self not strictly minimal ([%u,%u]=%llu <= [%u,%u]=%llu), aborting\n",
+		  i, j, (unsigned long long) VALUE(i, j), i, i, (unsigned long long) VALUE(i, i));
 	return -1;
       }
     }
@@ -771,61 +1020,61 @@ static void
 hwloc__groups_by_distances(struct hwloc_topology *topology,
 			   unsigned nbobjs,
 			   struct hwloc_obj **objs,
-			   float *_distances,
-			   unsigned nbaccuracies, float *accuracies,
-			   int fromuser,
-			   int needcheck,
-			   int verbose)
+			   uint64_t *_values,
+			   unsigned long kind,
+			   unsigned nbaccuracies,
+			   float *accuracies,
+			   int needcheck)
 {
-  unsigned *groupids = NULL;
+  unsigned *groupids;
   unsigned nbgroups = 0;
   unsigned i,j;
+  int verbose = topology->grouping_verbose;
+  hwloc_obj_t *groupobjs;
+  unsigned * groupsizes;
+  uint64_t *groupvalues;
+  unsigned failed = 0;
 
-  if (nbobjs <= 2) {
+  if (nbobjs <= 2)
       return;
-  }
 
-  groupids = malloc(sizeof(unsigned) * nbobjs);
-  if (NULL == groupids) {
-      return;
-  }
+  if (!(kind & HWLOC_DISTANCES_KIND_MEANS_LATENCY))
+    /* don't know how to use those for grouping */
+    /* TODO hwloc__find_groups_by_max_distance() for bandwidth */
+    return;
+
+  groupids = malloc(nbobjs * sizeof(*groupids));
+  if (!groupids)
+    return;
 
   for(i=0; i<nbaccuracies; i++) {
     if (verbose)
       fprintf(stderr, "Trying to group %u %s objects according to physical distances with accuracy %f\n",
 	      nbobjs, hwloc_obj_type_string(objs[0]->type), accuracies[i]);
-    if (needcheck && hwloc__check_grouping_matrix(nbobjs, _distances, accuracies[i], verbose) < 0)
+    if (needcheck && hwloc__check_grouping_matrix(nbobjs, _values, accuracies[i], verbose) < 0)
       continue;
-    nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _distances, accuracies[i], groupids, verbose);
+    nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _values, accuracies[i], groupids, verbose);
     if (nbgroups)
       break;
   }
   if (!nbgroups)
-    goto outter_free;
-
-  /* For convenience, put these declarations inside a block.  It's a
-     crying shame we can't use C99 syntax here, and have to do a bunch
-     of mallocs. :-( */
-  {
-      hwloc_obj_t *groupobjs = NULL;
-      unsigned *groupsizes = NULL;
-      float *groupdistances = NULL;
-      unsigned failed = 0;
-
-      groupobjs = malloc(sizeof(hwloc_obj_t) * nbgroups);
-      groupsizes = malloc(sizeof(unsigned) * nbgroups);
-      groupdistances = malloc(sizeof(float) * nbgroups * nbgroups);
-      if (NULL == groupobjs || NULL == groupsizes || NULL == groupdistances) {
-          goto inner_free;
-      }
+    goto out_with_groupids;
+
+  groupobjs = malloc(nbgroups * sizeof(*groupobjs));
+  groupsizes = malloc(nbgroups * sizeof(*groupsizes));
+  groupvalues = malloc(nbgroups * nbgroups * sizeof(*groupvalues));
+  if (!groupobjs || !groupsizes || !groupvalues)
+    goto out_with_groups;
+
       /* create new Group objects and record their size */
       memset(&(groupsizes[0]), 0, sizeof(groupsizes[0]) * nbgroups);
       for(i=0; i<nbgroups; i++) {
           /* create the Group object */
           hwloc_obj_t group_obj, res_obj;
-          group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+          group_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
           group_obj->cpuset = hwloc_bitmap_alloc();
-          group_obj->attr->group.depth = topology->next_group_depth;
+          group_obj->attr->group.kind = HWLOC_GROUP_KIND_DISTANCE;
+          group_obj->attr->group.subkind = topology->grouping_next_subkind;
           for (j=0; j<nbobjs; j++)
 	    if (groupids[j] == i+1) {
 	      /* assemble the group sets */
@@ -834,34 +1083,34 @@ hwloc__groups_by_distances(struct hwloc_topology *topology,
             }
           hwloc_debug_1arg_bitmap("adding Group object with %u objects and cpuset %s\n",
                                   groupsizes[i], group_obj->cpuset);
-          res_obj = hwloc__insert_object_by_cpuset(topology, group_obj,
-						   fromuser ? hwloc_report_user_distance_error : hwloc_report_os_error);
+          res_obj = hwloc__insert_object_by_cpuset(topology, NULL, group_obj,
+						   (kind & HWLOC_DISTANCES_KIND_FROM_USER) ? hwloc_report_user_distance_error : hwloc_report_os_error);
 	  /* res_obj may be NULL on failure to insert. */
 	  if (!res_obj)
 	    failed++;
 	  /* or it may be different from groupobjs if we got groups from XML import before grouping */
           groupobjs[i] = res_obj;
       }
+      topology->grouping_next_subkind++;
 
       if (failed)
 	/* don't try to group above if we got a NULL group here, just keep this incomplete level */
-	goto inner_free;
+	goto out_with_groups;
 
-      /* factorize distances */
-      memset(&(groupdistances[0]), 0, sizeof(groupdistances[0]) * nbgroups * nbgroups);
-#undef DISTANCE
-#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
-#define GROUP_DISTANCE(i, j) groupdistances[(i) * nbgroups + (j)]
+      /* factorize values */
+      memset(&(groupvalues[0]), 0, sizeof(groupvalues[0]) * nbgroups * nbgroups);
+#undef VALUE
+#define VALUE(i, j) _values[(i) * nbobjs + (j)]
+#define GROUP_VALUE(i, j) groupvalues[(i) * nbgroups + (j)]
       for(i=0; i<nbobjs; i++)
 	if (groupids[i])
 	  for(j=0; j<nbobjs; j++)
 	    if (groupids[j])
-                GROUP_DISTANCE(groupids[i]-1, groupids[j]-1) += DISTANCE(i, j);
+                GROUP_VALUE(groupids[i]-1, groupids[j]-1) += VALUE(i, j);
       for(i=0; i<nbgroups; i++)
           for(j=0; j<nbgroups; j++) {
               unsigned groupsize = groupsizes[i]*groupsizes[j];
-              float groupsizef = (float) groupsize;
-              GROUP_DISTANCE(i, j) /= groupsizef;
+              GROUP_VALUE(i, j) /= groupsize;
           }
 #ifdef HWLOC_DEBUG
       hwloc_debug("%s", "generated new distance matrix between groups:\n");
@@ -872,124 +1121,17 @@ hwloc__groups_by_distances(struct hwloc_topology *topology,
       for(i=0; i<nbgroups; i++) {
 	hwloc_debug("  % 5d", (int) i);
 	for(j=0; j<nbgroups; j++)
-	  hwloc_debug(" %2.3f", GROUP_DISTANCE(i, j));
-	hwloc_debug("%s", "\n");
-      }
-#endif
-
-      topology->next_group_depth++;
-      hwloc__groups_by_distances(topology, nbgroups, groupobjs, (float*) groupdistances, nbaccuracies, accuracies, fromuser, 0 /* no need to check generated matrix */, verbose);
-
-  inner_free:
-      /* Safely free everything */
-      if (NULL != groupobjs) {
-          free(groupobjs);
-      }
-      if (NULL != groupsizes) {
-          free(groupsizes);
-      }
-      if (NULL != groupdistances) {
-          free(groupdistances);
-      }
-  }
-
- outter_free:
-  if (NULL != groupids) {
-      free(groupids);
-  }
-}
-
-void
-hwloc_group_by_distances(struct hwloc_topology *topology)
-{
-  unsigned nbobjs;
-  struct hwloc_os_distances_s * osdist;
-  const char *env;
-  float accuracies[5] = { 0.0f, 0.01f, 0.02f, 0.05f, 0.1f };
-  unsigned nbaccuracies = 5;
-  hwloc_obj_t group_obj;
-  int verbose = 0;
-  unsigned i;
-  hwloc_localeswitch_declare;
-#ifdef HWLOC_DEBUG
-  unsigned j;
-#endif
-
-  env = getenv("HWLOC_GROUPING");
-  if (env && !atoi(env))
-    return;
-  /* backward compat with v1.2 */
-  if (getenv("HWLOC_IGNORE_DISTANCES"))
-    return;
-
-  hwloc_localeswitch_init();
-  env = getenv("HWLOC_GROUPING_ACCURACY");
-  if (!env) {
-    /* only use 0.0 */
-    nbaccuracies = 1;
-  } else if (strcmp(env, "try")) {
-    /* use the given value */
-    nbaccuracies = 1;
-    accuracies[0] = (float) atof(env);
-  } /* otherwise try all values */
-  hwloc_localeswitch_fini();
-
-#ifdef HWLOC_DEBUG
-  verbose = 1;
-#else
-  env = getenv("HWLOC_GROUPING_VERBOSE");
-  if (env)
-    verbose = atoi(env);
-#endif
-
-  for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
-
-    nbobjs = osdist->nbobjs;
-    if (!nbobjs)
-      continue;
-
-    if (osdist->objs) {
-      /* if we have objs, we must have distances as well,
-       * thanks to hwloc_convert_distances_indexes_into_objects()
-       */
-      assert(osdist->distances);
-
-#ifdef HWLOC_DEBUG
-      hwloc_debug("%s", "trying to group objects using distance matrix:\n");
-      hwloc_debug("%s", "  index");
-      for(j=0; j<nbobjs; j++)
-	hwloc_debug(" % 5d", (int) osdist->objs[j]->os_index);
-      hwloc_debug("%s", "\n");
-      for(i=0; i<nbobjs; i++) {
-	hwloc_debug("  % 5d", (int) osdist->objs[i]->os_index);
-	for(j=0; j<nbobjs; j++)
-	  hwloc_debug(" %2.3f", osdist->distances[i*nbobjs + j]);
+	  hwloc_debug(" %llu", (unsigned long long) GROUP_VALUE(i, j));
 	hwloc_debug("%s", "\n");
       }
 #endif
 
-      hwloc__groups_by_distances(topology, nbobjs,
-				 osdist->objs,
-				 osdist->distances,
-				 nbaccuracies, accuracies,
-				 osdist->indexes != NULL,
-				 1 /* check the first matrice */,
-				 verbose);
+      hwloc__groups_by_distances(topology, nbgroups, groupobjs, groupvalues, kind, nbaccuracies, accuracies, 0 /* no need to check generated matrix */);
 
-      /* add a final group object covering everybody so that the distance matrix can be stored somewhere.
-       * this group will be merged into a regular object if the matrix isn't strangely incomplete
-       */
-      group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
-      group_obj->attr->group.depth = (unsigned) -1;
-      group_obj->cpuset = hwloc_bitmap_alloc();
-      for(i=0; i<nbobjs; i++) {
-	/* assemble the group sets */
-	hwloc_obj_add_other_obj_sets(group_obj, osdist->objs[i]);
-      }
-      hwloc_debug_1arg_bitmap("adding Group object (as root of distance matrix with %u objects) with cpuset %s\n",
-			      nbobjs, group_obj->cpuset);
-      hwloc__insert_object_by_cpuset(topology, group_obj,
-				     osdist->indexes != NULL ? hwloc_report_user_distance_error : hwloc_report_os_error);
-    }
-  }
+ out_with_groups:
+  free(groupobjs);
+  free(groupsizes);
+  free(groupvalues);
+ out_with_groupids:
+  free(groupids);
 }
diff --git a/ext/hwloc/hwloc/misc.c b/ext/hwloc/hwloc/misc.c
index 3da6687d4..a7b9a5ebd 100644
--- a/ext/hwloc/hwloc/misc.c
+++ b/ext/hwloc/hwloc/misc.c
@@ -1,14 +1,14 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2010 Université Bordeaux
- * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2009-2018 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <private/private.h>
-#include <private/misc.h>
+#include "private/autogen/config.h"
+#include "private/private.h"
+#include "private/misc.h"
 
 #include <stdarg.h>
 #ifdef HAVE_SYS_UTSNAME_H
@@ -28,6 +28,7 @@ extern char *program_invocation_name;
 extern char *__progname;
 #endif
 
+#ifndef HWLOC_HAVE_CORRECT_SNPRINTF
 int hwloc_snprintf(char *str, size_t size, const char *format, ...)
 {
   int ret;
@@ -77,21 +78,7 @@ int hwloc_snprintf(char *str, size_t size, const char *format, ...)
 
   return ret;
 }
-
-int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n)
-{
-  size_t i = 0;
-  while (*haystack && *haystack != ':') {
-    int ha = *haystack++;
-    int low_h = tolower(ha);
-    int ne = *needle++;
-    int low_n = tolower(ne);
-    if (low_h != low_n)
-      return 1;
-    i++;
-  }
-  return i < n;
-}
+#endif
 
 void hwloc_add_uname_info(struct hwloc_topology *topology __hwloc_attribute_unused,
 			  void *cached_uname __hwloc_attribute_unused)
@@ -128,18 +115,18 @@ char *
 hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
 {
 #if HAVE_DECL_GETMODULEFILENAME
-  char name[256], *basename;
+  char name[256], *local_basename;
   unsigned res = GetModuleFileName(NULL, name, sizeof(name));
   if (res == sizeof(name) || !res)
     return NULL;
-  basename = strrchr(name, '\\');
-  if (!basename)
-    basename = name;
+  local_basename = strrchr(name, '\\');
+  if (!local_basename)
+    local_basename = name;
   else
-    basename++;
-  return strdup(basename);
+    local_basename++;
+  return strdup(local_basename);
 #else /* !HAVE_GETMODULEFILENAME */
-  const char *name, *basename;
+  const char *name, *local_basename;
 #if HAVE_DECL_GETPROGNAME
   name = getprogname(); /* FreeBSD, NetBSD, some Solaris */
 #elif HAVE_DECL_GETEXECNAME
@@ -151,16 +138,16 @@ hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
   name = __progname; /* fallback for most unix, used for OpenBSD */
 #else
   /* TODO: _NSGetExecutablePath(path, &size) on Darwin */
-  /* TODO: AIX, HPUX, OSF */
+  /* TODO: AIX, HPUX */
   name = NULL;
 #endif
   if (!name)
     return NULL;
-  basename = strrchr(name, '/');
-  if (!basename)
-    basename = name;
+  local_basename = strrchr(name, '/');
+  if (!local_basename)
+    local_basename = name;
   else
-    basename++;
-  return strdup(basename);
+    local_basename++;
+  return strdup(local_basename);
 #endif /* !HAVE_GETMODULEFILENAME */
 }
diff --git a/ext/hwloc/hwloc/pci-common.c b/ext/hwloc/hwloc/pci-common.c
index 1000ca1b1..deca5cce5 100644
--- a/ext/hwloc/hwloc/pci-common.c
+++ b/ext/hwloc/hwloc/pci-common.c
@@ -1,12 +1,186 @@
 /*
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <hwloc/plugins.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "hwloc/plugins.h"
+#include "private/private.h"
+#include "private/debug.h"
+#include "private/misc.h"
+
+#include <fcntl.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/stat.h>
+
+#if defined(HWLOC_WIN_SYS) && !defined(__CYGWIN__)
+#include <io.h>
+#define open _open
+#define read _read
+#define close _close
+#endif
+
+
+/**************************************
+ * Init/Exit and Forced PCI localities
+ */
+
+/* Parse one "domain[:bus[-bus]] cpuset" entry (hexadecimal) and append it to topology->pci_forced_locality. */
+static void
+hwloc_pci_forced_locality_parse_one(struct hwloc_topology *topology,
+				    const char *string /* must contain a ' ' */,
+				    unsigned *allocated)
+{
+  unsigned nr = topology->pci_forced_locality_nr;
+  unsigned domain, bus_first, bus_last, dummy;
+  hwloc_bitmap_t set;
+  char *tmp;
+
+  if (sscanf(string, "%x:%x-%x %x", &domain, &bus_first, &bus_last, &dummy) == 4) {
+    /* explicit bus range; dummy only verifies that a hexadecimal value follows the space */
+  } else if (sscanf(string, "%x:%x %x", &domain, &bus_first, &dummy) == 3) {
+    bus_last = bus_first;
+  } else if (sscanf(string, "%x %x", &domain, &dummy) == 2) {
+    bus_first = 0;
+    bus_last = 255;
+  } else
+    return;
+
+  tmp = strchr(string, ' ');
+  if (!tmp)
+    return;
+  tmp++;
+  set = hwloc_bitmap_alloc();
+  if (!set)
+    return; /* failed to allocate, ignore this forced locality */
+  hwloc_bitmap_sscanf(set, tmp);
+
+  if (!*allocated) {
+    topology->pci_forced_locality = malloc(sizeof(*topology->pci_forced_locality));
+    if (!topology->pci_forced_locality)
+      goto out_with_set; /* failed to allocate, ignore this forced locality */
+    *allocated = 1;
+  } else if (nr >= *allocated) {
+    struct hwloc_pci_forced_locality_s *tmplocs =
+      realloc(topology->pci_forced_locality, 2 * *allocated * sizeof(*topology->pci_forced_locality));
+    if (!tmplocs)
+      goto out_with_set; /* failed to allocate, ignore this forced locality */
+    topology->pci_forced_locality = tmplocs;
+    *allocated *= 2;
+  }
+
+  topology->pci_forced_locality[nr].domain = domain;
+  topology->pci_forced_locality[nr].bus_first = bus_first;
+  topology->pci_forced_locality[nr].bus_last = bus_last;
+  topology->pci_forced_locality[nr].cpuset = set;
+  topology->pci_forced_locality_nr++;
+  return;
+
+ out_with_set:
+  hwloc_bitmap_free(set);
+}
+
+/* Split a forced-locality string into entries separated by ';', '\r' or '\n'
+ * and hand each entry to hwloc_pci_forced_locality_parse_one().
+ * The input is duplicated because separators are overwritten in place. */
+static void
+hwloc_pci_forced_locality_parse(struct hwloc_topology *topology, const char *_env)
+{
+  char *env = strdup(_env);
+  unsigned allocated = 0;
+  char *tmp = env; /* NULL if strdup() failed: nothing is parsed then */
+
+  while (tmp) {
+    size_t len = strcspn(tmp, ";\r\n");
+    char *next = NULL;
+
+    if (tmp[len] != '\0') {
+      /* terminate this entry; continue only if something follows the separator */
+      tmp[len] = '\0';
+      if (tmp[len+1] != '\0')
+	next = &tmp[len]+1;
+    }
+
+    hwloc_pci_forced_locality_parse_one(topology, tmp, &allocated);
+    tmp = next;
+  }
+
+  free(env);
+}
+
+void
+hwloc_pci_discovery_init(struct hwloc_topology *topology)
+{
+  topology->first_pci_locality = NULL; /* empty list of discovered localities */
+  topology->last_pci_locality = NULL;
+  topology->pci_forced_locality = NULL; /* no forced locality array yet */
+  topology->pci_forced_locality_nr = 0;
+  topology->pci_has_forced_locality = 0;
+}
+
+/* Load forced PCI localities from HWLOC_PCI_LOCALITY, given either as a file name or as the list itself. */
+void
+hwloc_pci_discovery_prepare(struct hwloc_topology *topology)
+{
+  char *env = getenv("HWLOC_PCI_LOCALITY");
+  struct stat st;
+  char *buffer;
+  int fd;
+
+  if (!env)
+    return;
+  topology->pci_has_forced_locality = 1;
+
+  fd = open(env, O_RDONLY);
+  if (fd < 0) {
+    /* cannot be opened as a file: parse the value of the variable itself */
+    hwloc_pci_forced_locality_parse(topology, env);
+    return;
+  }
+  if (fstat(fd, &st) != 0) {
+    /* cannot get the file size, silently ignore the file */
+  } else if (st.st_size > 64*1024) { /* random limit large enough to store multiple cpusets for thousands of PUs */
+    fprintf(stderr, "Ignoring HWLOC_PCI_LOCALITY file `%s' too large (%lu bytes)\n",
+	    env, (unsigned long) st.st_size);
+  } else {
+    buffer = malloc(st.st_size+1);
+    if (buffer && read(fd, buffer, st.st_size) == st.st_size) {
+      buffer[st.st_size] = '\0';
+      hwloc_pci_forced_locality_parse(topology, buffer);
+    }
+    free(buffer);
+  }
+  close(fd);
+}
+
+void
+hwloc_pci_discovery_exit(struct hwloc_topology *topology)
+{
+  struct hwloc_pci_locality_s *loc, *following;
+  unsigned idx;
+
+  /* forced localities: free each cpuset, then the array itself */
+  for (idx = 0; idx < topology->pci_forced_locality_nr; idx++)
+    hwloc_bitmap_free(topology->pci_forced_locality[idx].cpuset);
+  free(topology->pci_forced_locality);
+
+  /* discovered localities: save the next pointer before freeing each entry */
+  for (loc = topology->first_pci_locality; loc; loc = following) {
+    following = loc->next;
+    hwloc_bitmap_free(loc->cpuset);
+    free(loc);
+  }
+
+  hwloc_pci_discovery_init(topology); /* reset counters and pointers to the freed memory */
+}
+
+
+/******************************
+ * Inserting in Tree by Bus ID
+ */
 
 #ifdef HWLOC_DEBUG
 static void
@@ -30,7 +204,7 @@ hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
     if (pcidev->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
       hwloc_debug("HostBridge");
     else
-      hwloc_debug("Bridge [%04x:%04x]", busid,
+      hwloc_debug("%s Bridge [%04x:%04x]", busid,
 		  pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id);
     hwloc_debug(" to %04x:[%02x:%02x]\n",
 		pcidev->attr->bridge.downstream.pci.domain, pcidev->attr->bridge.downstream.pci.secondary_bus, pcidev->attr->bridge.downstream.pci.subordinate_bus);
@@ -40,39 +214,19 @@ hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
 		pcidev->attr->pcidev.subvendor_id, pcidev->attr->pcidev.subdevice_id,
 		pcidev->attr->pcidev.revision, pcidev->attr->pcidev.class_id);
 }
-#endif /* HWLOC_DEBUG */
 
 static void
-hwloc_pci_traverse_lookuposdevices_cb(void * cbdata,
-				      struct hwloc_obj *pcidev)
-{
-  struct hwloc_backend *backend = cbdata;
-
-  if (pcidev->type == HWLOC_OBJ_BRIDGE)
-    return;
-
-  hwloc_backends_notify_new_object(backend, pcidev);
-}
-
-static void
-hwloc_pci__traverse(void * cbdata, struct hwloc_obj *root,
-		    void (*cb)(void * cbdata, struct hwloc_obj *))
+hwloc_pci_traverse(void * cbdata, struct hwloc_obj *tree,
+		   void (*cb)(void * cbdata, struct hwloc_obj *))
 {
-  struct hwloc_obj *child = root->io_first_child;
-  while (child) {
-    cb(cbdata, child);
+  hwloc_obj_t child;
+  cb(cbdata, tree);
+  for_each_io_child(child, tree) {
     if (child->type == HWLOC_OBJ_BRIDGE)
-      hwloc_pci__traverse(cbdata, child, cb);
-    child = child->next_sibling;
+      hwloc_pci_traverse(cbdata, child, cb);
   }
 }
-
-static void
-hwloc_pci_traverse(void * cbdata, struct hwloc_obj *root,
-		   void (*cb)(void * cbdata, struct hwloc_obj *))
-{
-  hwloc_pci__traverse(cbdata, root, cb);
-}
+#endif /* HWLOC_DEBUG */
 
 enum hwloc_pci_busid_comparison_e {
   HWLOC_PCI_BUSID_LOWER,
@@ -84,10 +238,12 @@ enum hwloc_pci_busid_comparison_e {
 static enum hwloc_pci_busid_comparison_e
 hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
 {
+#ifdef HWLOC_DEBUG
   if (a->type == HWLOC_OBJ_BRIDGE)
     assert(a->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
   if (b->type == HWLOC_OBJ_BRIDGE)
     assert(b->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+#endif
 
   if (a->attr->pcidev.domain < b->attr->pcidev.domain)
     return HWLOC_PCI_BUSID_LOWER;
@@ -126,11 +282,11 @@ hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
 }
 
 static void
-hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
+hwloc_pci_add_object(struct hwloc_obj *parent, struct hwloc_obj **parent_io_first_child_p, struct hwloc_obj *new)
 {
   struct hwloc_obj **curp, **childp;
 
-  curp = &root->io_first_child;
+  curp = parent_io_first_child_p;
   while (*curp) {
     enum hwloc_pci_busid_comparison_e comp = hwloc_pci_compare_busids(new, *curp);
     switch (comp) {
@@ -140,27 +296,31 @@ hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
       continue;
     case HWLOC_PCI_BUSID_INCLUDED:
       /* insert new below current bridge */
-      hwloc_pci_add_object(*curp, new);
+      hwloc_pci_add_object(*curp, &(*curp)->io_first_child, new);
       return;
     case HWLOC_PCI_BUSID_LOWER:
     case HWLOC_PCI_BUSID_SUPERSET: {
       /* insert new before current */
       new->next_sibling = *curp;
       *curp = new;
-      new->parent = root;
+      new->parent = parent;
       if (new->type == HWLOC_OBJ_BRIDGE) {
 	/* look at remaining siblings and move some below new */
 	childp = &new->io_first_child;
 	curp = &new->next_sibling;
 	while (*curp) {
-	  if (hwloc_pci_compare_busids(new, *curp) == HWLOC_PCI_BUSID_LOWER) {
-	    /* this sibling remains under root, after new */
-	    curp = &(*curp)->next_sibling;
-	    /* even if the list is sorted by busid, we can't break because the current bridge creates a bus that may be higher. some object may have to go there */
+	  hwloc_obj_t cur = *curp;
+	  if (hwloc_pci_compare_busids(new, cur) == HWLOC_PCI_BUSID_LOWER) {
+	    /* this sibling remains under root, after new. */
+	    if (cur->attr->pcidev.domain > new->attr->pcidev.domain
+		|| cur->attr->pcidev.bus > new->attr->bridge.downstream.pci.subordinate_bus)
+	      /* this sibling is even above new's subordinate bus, no other sibling could go below new */
+	      return;
+	    curp = &cur->next_sibling;
 	  } else {
 	    /* this sibling goes under new */
-	    *childp = *curp;
-	    *curp = (*curp)->next_sibling;
+	    *childp = cur;
+	    *curp = cur->next_sibling;
 	    (*childp)->parent = new;
 	    (*childp)->next_sibling = NULL;
 	    childp = &(*childp)->next_sibling;
@@ -172,15 +332,101 @@ hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
     }
   }
   /* add to the end of the list if higher than everybody */
-  new->parent = root;
+  new->parent = parent;
   new->next_sibling = NULL;
   *curp = new;
 }
 
+void
+hwloc_pcidisc_tree_insert_by_busid(struct hwloc_obj **treep, struct hwloc_obj *obj)
+{
+  struct hwloc_obj *top = NULL; /* no parent on top of tree */
+  hwloc_pci_add_object(top, treep, obj);
+}
+
+
+/**********************
+ * Attaching PCI Trees
+ */
+
 static struct hwloc_obj *
-hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
-				  struct hwloc_obj *hostbridge,
-				  struct hwloc_obj *parent)
+hwloc_pcidisc_add_hostbridges(struct hwloc_topology *topology,
+			      struct hwloc_obj *old_tree)
+{
+  struct hwloc_obj * new = NULL, **newp = &new;
+
+  /*
+   * tree points to all objects connected to any upstream bus in the machine.
+   * We now create one real hostbridge object per upstream bus.
+   * It's not actually a PCI device so we have to create it.
+   */
+  while (old_tree) {
+    /* start a new host bridge */
+    struct hwloc_obj *hostbridge;
+    struct hwloc_obj **dstnextp;
+    struct hwloc_obj **srcnextp;
+    struct hwloc_obj *child;
+    unsigned short current_domain;
+    unsigned char current_bus;
+    unsigned char current_subordinate;
+
+    hostbridge = hwloc_alloc_setup_object(topology, HWLOC_OBJ_BRIDGE, HWLOC_UNKNOWN_INDEX);
+    if (!hostbridge) {
+      /* just queue remaining things without hostbridges and return */
+      *newp = old_tree;
+      return new;
+    }
+    dstnextp = &hostbridge->io_first_child;
+
+    srcnextp = &old_tree;
+    child = *srcnextp;
+    current_domain = child->attr->pcidev.domain;
+    current_bus = child->attr->pcidev.bus;
+    current_subordinate = current_bus;
+
+    hwloc_debug("Adding new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+
+  next_child:
+    /* remove next child from tree */
+    *srcnextp = child->next_sibling;
+    /* append it to hostbridge */
+    *dstnextp = child;
+    child->parent = hostbridge;
+    child->next_sibling = NULL;
+    dstnextp = &child->next_sibling;
+
+    /* compute hostbridge secondary/subordinate buses */
+    if (child->type == HWLOC_OBJ_BRIDGE
+	&& child->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
+      current_subordinate = child->attr->bridge.downstream.pci.subordinate_bus;
+
+    /* use next child if it has the same domains/bus */
+    child = *srcnextp;
+    if (child
+	&& child->attr->pcidev.domain == current_domain
+	&& child->attr->pcidev.bus == current_bus)
+      goto next_child;
+
+    /* finish setting up this hostbridge */
+    hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
+    hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+    hostbridge->attr->bridge.downstream.pci.domain = current_domain;
+    hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
+    hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
+    hwloc_debug("  new PCI hostbridge covers %04x:[%02x-%02x]\n",
+		current_domain, current_bus, current_subordinate);
+
+    *newp = hostbridge;
+    newp = &hostbridge->next_sibling;
+  }
+
+  return new;
+}
+
+static struct hwloc_obj *
+hwloc_pci_fixup_busid_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
+			     struct hwloc_pcidev_attr_s *busid,
+			     struct hwloc_obj *parent)
 {
   /* Xeon E5v3 in cluster-on-die mode only have PCI on the first NUMA node of each package.
    * but many dual-processor host report the second PCI hierarchy on 2nd NUMA of first package.
@@ -196,13 +442,13 @@ hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attrib
 	fprintf(stderr, "****************************************************************************\n");
 	fprintf(stderr, "* hwloc %s has encountered an incorrect PCI locality information.\n", HWLOC_VERSION);
 	fprintf(stderr, "* PCI bus %04x:%02x is supposedly close to 2nd NUMA node of 1st package,\n",
-		hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+		busid->domain, busid->bus);
 	fprintf(stderr, "* however hwloc believes this is impossible on this architecture.\n");
 	fprintf(stderr, "* Therefore the PCI bus will be moved to 1st NUMA node of 2nd package.\n");
 	fprintf(stderr, "*\n");
 	fprintf(stderr, "* If you feel this fixup is wrong, disable it by setting in your environment\n");
 	fprintf(stderr, "* HWLOC_PCI_%04x_%02x_LOCALCPUS= (empty value), and report the problem\n",
-		hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+		busid->domain, busid->bus);
 	fprintf(stderr, "* to the hwloc's user mailing list together with the XML output of lstopo.\n");
 	fprintf(stderr, "*\n");
 	fprintf(stderr, "* You may silence this message by setting HWLOC_HIDE_ERRORS=1 in your environment.\n");
@@ -216,175 +462,310 @@ hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attrib
 }
 
 static struct hwloc_obj *
-hwloc_pci_find_hostbridge_parent(struct hwloc_topology *topology, struct hwloc_backend *backend,
-				 struct hwloc_obj *hostbridge)
+hwloc__pci_find_busid_parent(struct hwloc_topology *topology, struct hwloc_pcidev_attr_s *busid)
 {
   hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
-  hwloc_obj_t group_obj, parent;
-  const char *env;
+  hwloc_obj_t parent;
+  int forced = 0;
+  int noquirks = 0;
+  unsigned i;
   int err;
 
-  /* override the cpuset with the environment if given */
-  int forced = 0;
-  char envname[256];
-  snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
-	   hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
-  env = getenv(envname);
-  if (env)
-    /* if env exists but is empty, don't let quirks change what the OS reports */
-    forced = 1;
-  if (env && *env) {
-    /* force the hostbridge cpuset */
-    hwloc_debug("Overriding localcpus using %s in the environment\n", envname);
-    hwloc_bitmap_sscanf(cpuset, env);
-  } else {
-    /* get the hostbridge cpuset by acking the OS backend.
-     * it's not a PCI device, so we use its first child locality info.
-     */
-    err = hwloc_backends_get_obj_cpuset(backend, hostbridge->io_first_child, cpuset);
+  hwloc_debug("Looking for parent of PCI busid %04x:%02x:%02x.%01x\n",
+	      busid->domain, busid->bus, busid->dev, busid->func);
+
+  /* try to match a forced locality */
+  if (topology->pci_has_forced_locality) {
+    for(i=0; i<topology->pci_forced_locality_nr; i++) {
+      if (busid->domain == topology->pci_forced_locality[i].domain
+	  && busid->bus >= topology->pci_forced_locality[i].bus_first
+	  && busid->bus <= topology->pci_forced_locality[i].bus_last) {
+	hwloc_bitmap_copy(cpuset, topology->pci_forced_locality[i].cpuset);
+	forced = 1;
+	break;
+      }
+    }
+    /* if pci locality was forced, even empty, don't let quirks change what the OS reports */
+    noquirks = 1;
+  }
+
+  /* deprecated per-bus HWLOC_PCI_<domain>_<bus>_LOCALCPUS variables, only looked up if nothing was forced above */
+  if (!forced) {
+    const char *env;
+    char envname[256];
+    /* override the cpuset with the environment if given */
+    snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
+	     busid->domain, busid->bus);
+    env = getenv(envname);
+    if (env) {
+      static int reported = 0; /* warn only once, whatever the number of buses */
+      if (!topology->pci_has_forced_locality && !reported) {
+	fprintf(stderr, "Environment variable %s is deprecated, please use HWLOC_PCI_LOCALITY instead.\n", envname);
+	reported = 1;
+      }
+      if (*env) {
+	/* force the cpuset */
+	hwloc_debug("Overriding PCI locality using %s in the environment\n", envname);
+	hwloc_bitmap_sscanf(cpuset, env);
+	forced = 1;
+      }
+      /* if env exists, even empty, don't let quirks change what the OS reports */
+      noquirks = 1;
+    }
+  }
+
+  if (!forced) {
+    /* get the cpuset by asking the backend that provides the relevant hook, if any. */
+    struct hwloc_backend *backend = topology->get_pci_busid_cpuset_backend;
+    if (backend)
+      err = backend->get_pci_busid_cpuset(backend, busid, cpuset);
+    else
+      err = -1;
     if (err < 0)
-      /* if we got nothing, assume the hostbridge is attached to the top of hierarchy */
+      /* if we got nothing, assume this PCI bus is attached to the top of hierarchy */
       hwloc_bitmap_copy(cpuset, hwloc_topology_get_topology_cpuset(topology));
   }
 
-  hwloc_debug_bitmap("Attaching hostbridge to cpuset %s\n", cpuset);
-
-  /* restrict to the existing complete cpuset to avoid errors later */
-  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
-
-  /* if the remaining cpuset is empty, take the root */
-  if (hwloc_bitmap_iszero(cpuset))
-    hwloc_bitmap_copy(cpuset, hwloc_topology_get_complete_cpuset(topology));
-
-  group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
-  if (group_obj) {
-    group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
-    hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
-    group_obj->cpuset = hwloc_bitmap_dup(cpuset);
-    group_obj->attr->group.depth = (unsigned) -1;
-    parent = hwloc__insert_object_by_cpuset(topology, group_obj, hwloc_report_os_error);
-    if (parent == group_obj) {
-      /* group inserted without being merged, setup its sets */
-      hwloc_obj_add_children_sets(group_obj);
-    } else if (!parent) {
-      /* Failed to insert the parent, maybe a conflicting cpuset, attach to the root object instead */
-      parent = hwloc_get_root_obj(topology);
-    } else {
-      /* Got merged. This object has the right cpuset, but it could be a cache or so,
-       * go up as long as the (complete)cpuset is the same.
-       */
-      while (parent->parent) {
-	if (parent->complete_cpuset && parent->parent->complete_cpuset) {
-	  if (!hwloc_bitmap_isequal(parent->complete_cpuset, parent->parent->complete_cpuset))
-	    break;
-	} else {
-	  if (!hwloc_bitmap_isequal(parent->cpuset, parent->parent->cpuset))
-	    break;
-	}
-	parent = parent->parent;
-      }
+  hwloc_debug_bitmap("  will attach PCI bus to cpuset %s\n", cpuset);
 
-      if (!forced)
-	parent = hwloc_pci_fixup_hostbridge_parent(topology, hostbridge, parent);
-    }
+  parent = hwloc_find_insert_io_parent_by_complete_cpuset(topology, cpuset);
+  if (parent) {
+    if (!noquirks)
+      /* We found a valid parent. Check that the OS didn't report invalid locality */
+      parent = hwloc_pci_fixup_busid_parent(topology, busid, parent);
   } else {
-    /* Failed to create the Group, attach to the root object instead */
+    /* Fallback to root */
     parent = hwloc_get_root_obj(topology);
   }
 
   hwloc_bitmap_free(cpuset);
-
   return parent;
 }
 
 int
-hwloc_insert_pci_device_list(struct hwloc_backend *backend,
-			     struct hwloc_obj *first_obj)
+hwloc_pcidisc_tree_attach(struct hwloc_topology *topology, struct hwloc_obj *tree)
 {
-  struct hwloc_topology *topology = backend->topology;
-  struct hwloc_obj fakeparent;
-  struct hwloc_obj *obj;
-  unsigned current_hostbridge;
+  enum hwloc_type_filter_e bfilter;
 
-  if (!first_obj)
+  if (!tree)
     /* found nothing, exit */
     return 0;
 
-  /* first, organise object as tree under a fake parent object */
-  fakeparent.parent = NULL;
-  fakeparent.io_first_child = NULL;
-  while (first_obj) {
-    obj = first_obj;
-    first_obj = obj->next_sibling;
-    hwloc_pci_add_object(&fakeparent, obj);
-  }
-
 #ifdef HWLOC_DEBUG
-  hwloc_debug("%s", "\nPCI hierarchy under fake parent:\n");
-  hwloc_pci_traverse(NULL, &fakeparent, hwloc_pci_traverse_print_cb);
+  hwloc_debug("%s", "\nPCI hierarchy:\n");
+  hwloc_pci_traverse(NULL, tree, hwloc_pci_traverse_print_cb);
   hwloc_debug("%s", "\n");
 #endif
 
-  /* walk the hierarchy, and lookup OS devices */
-  hwloc_pci_traverse(backend, &fakeparent, hwloc_pci_traverse_lookuposdevices_cb);
+  bfilter = topology->type_filter[HWLOC_OBJ_BRIDGE];
+  if (bfilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
+    tree = hwloc_pcidisc_add_hostbridges(topology, tree);
+  }
 
-  /*
-   * fakeparent lists all objects connected to any upstream bus in the machine.
-   * We now create one real hostbridge object per upstream bus.
-   * It's not actually a PCI device so we have to create it.
-   */
-  current_hostbridge = 0;
-  while (fakeparent.io_first_child) {
-    /* start a new host bridge */
-    struct hwloc_obj *hostbridge = hwloc_alloc_setup_object(HWLOC_OBJ_BRIDGE, current_hostbridge++);
-    struct hwloc_obj **dstnextp = &hostbridge->io_first_child;
-    struct hwloc_obj **srcnextp = &fakeparent.io_first_child;
-    struct hwloc_obj *child = *srcnextp;
+  while (tree) {
+    struct hwloc_obj *obj, *pciobj;
     struct hwloc_obj *parent;
-    unsigned short current_domain = child->attr->pcidev.domain;
-    unsigned char current_bus = child->attr->pcidev.bus;
-    unsigned char current_subordinate = current_bus;
+    struct hwloc_pci_locality_s *loc;
+    unsigned domain, bus_min, bus_max;
 
-    hwloc_debug("Starting new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+    obj = tree;
 
-  next_child:
-    /* remove next child from fakeparent */
-    *srcnextp = child->next_sibling;
-    /* append it to hostbridge */
-    *dstnextp = child;
-    child->parent = hostbridge;
-    child->next_sibling = NULL;
-    dstnextp = &child->next_sibling;
+    /* hostbridges don't have a PCI busid for looking up locality, use their first child */
+    if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+      pciobj = obj->io_first_child;
+    else
+      pciobj = obj;
+    /* now we have a pci device or a pci bridge */
+    assert(pciobj->type == HWLOC_OBJ_PCI_DEVICE
+	   || (pciobj->type == HWLOC_OBJ_BRIDGE && pciobj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
 
-    /* compute hostbridge secondary/subordinate buses */
-    if (child->type == HWLOC_OBJ_BRIDGE
-	&& child->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
-      current_subordinate = child->attr->bridge.downstream.pci.subordinate_bus;
+    if (obj->type == HWLOC_OBJ_BRIDGE) {
+      domain = obj->attr->bridge.downstream.pci.domain;
+      bus_min = obj->attr->bridge.downstream.pci.secondary_bus;
+      bus_max = obj->attr->bridge.downstream.pci.subordinate_bus;
+    } else {
+      domain = pciobj->attr->pcidev.domain;
+      bus_min = pciobj->attr->pcidev.bus;
+      bus_max = pciobj->attr->pcidev.bus;
+    }
 
-    /* use next child if it has the same domains/bus */
-    child = *srcnextp;
-    if (child
-	&& child->attr->pcidev.domain == current_domain
-	&& child->attr->pcidev.bus == current_bus)
-      goto next_child;
+    /* find where to attach that PCI bus */
+    parent = hwloc__pci_find_busid_parent(topology, &pciobj->attr->pcidev);
 
-    /* finish setting up this hostbridge */
-    hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
-    hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
-    hostbridge->attr->bridge.downstream.pci.domain = current_domain;
-    hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
-    hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
-    hwloc_debug("New PCI hostbridge %04x:[%02x-%02x]\n",
-		current_domain, current_bus, current_subordinate);
+    /* reuse the previous locality if possible */
+    if (topology->last_pci_locality
+	&& parent == topology->last_pci_locality->parent
+	&& domain == topology->last_pci_locality->domain
+	&& (bus_min == topology->last_pci_locality->bus_max
+	    || bus_min == topology->last_pci_locality->bus_max+1)) {
+      hwloc_debug("  Reusing PCI locality up to bus %04x:%02x\n",
+		  domain, bus_max);
+      topology->last_pci_locality->bus_max = bus_max;
+      goto done;
+    }
 
-    /* attach the hostbridge where it belongs */
-    parent = hwloc_pci_find_hostbridge_parent(topology, backend, hostbridge);
-    hwloc_insert_object_by_parent(topology, parent, hostbridge);
+    loc = malloc(sizeof(*loc));
+    if (!loc) {
+      /* fallback to attaching to root */
+      parent = hwloc_get_root_obj(topology);
+      goto done;
+    }
+
+    loc->domain = domain;
+    loc->bus_min = bus_min;
+    loc->bus_max = bus_max;
+    loc->parent = parent;
+    loc->cpuset = hwloc_bitmap_dup(parent->cpuset);
+    if (!loc->cpuset) {
+      /* fallback to attaching to root */
+      free(loc);
+      parent = hwloc_get_root_obj(topology);
+      goto done;
+    }
+
+    hwloc_debug("Adding PCI locality %s P#%u for bus %04x:[%02x:%02x]\n",
+		hwloc_obj_type_string(parent->type), parent->os_index, loc->domain, loc->bus_min, loc->bus_max);
+    if (topology->last_pci_locality) {
+      loc->prev = topology->last_pci_locality;
+      loc->next = NULL;
+      topology->last_pci_locality->next = loc;
+      topology->last_pci_locality = loc;
+    } else {
+      loc->prev = NULL;
+      loc->next = NULL;
+      topology->first_pci_locality = loc;
+      topology->last_pci_locality = loc;
+    }
+
+  done:
+    /* dequeue this object */
+    tree = obj->next_sibling;
+    obj->next_sibling = NULL;
+    hwloc_insert_object_by_parent(topology, parent, obj);
   }
 
-  return 1;
+  return 0;
 }
 
+
+/*********************************
+ * Finding PCI objects or parents
+ */
+
+/* Return what hwloc_pci_find_by_busid() finds for this bus id or, when it
+ * finds nothing, the parent computed from the locality of that bus. */
+struct hwloc_obj *
+hwloc_pci_find_parent_by_busid(struct hwloc_topology *topology,
+			       unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  struct hwloc_pcidev_attr_s wanted;
+  hwloc_obj_t found = hwloc_pci_find_by_busid(topology, domain, bus, dev, func);
+
+  if (!found) {
+    /* nothing found in the existing PCI objects, use the locality of that bus instead */
+    wanted.domain = domain;
+    wanted.bus = bus;
+    wanted.dev = dev;
+    wanted.func = func;
+    found = hwloc__pci_find_busid_parent(topology, &wanted);
+  }
+  return found;
+}
+
+/* return the smallest object below parent that contains the desired busid, or parent itself if no child does */
+static struct hwloc_obj *
+hwloc__pci_find_by_busid(hwloc_obj_t parent,
+			 unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t child;
+
+  for_each_io_child(child, parent) {
+    if (child->type == HWLOC_OBJ_PCI_DEVICE
+	|| (child->type == HWLOC_OBJ_BRIDGE
+	    && child->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
+      if (child->attr->pcidev.domain == domain
+	  && child->attr->pcidev.bus == bus
+	  && child->attr->pcidev.dev == dev
+	  && child->attr->pcidev.func == func)
+	/* exact match on domain:bus:dev.func, that's the right bus id */
+	return child;
+      if (child->attr->pcidev.domain > domain
+	  || (child->attr->pcidev.domain == domain
+	      && child->attr->pcidev.bus > bus))
+	/* bus id too high, won't find anything later (assumes children are sorted by busid), return parent */
+	return parent;
+      if (child->type == HWLOC_OBJ_BRIDGE
+	  && child->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+	  && child->attr->bridge.downstream.pci.domain == domain
+	  && child->attr->bridge.downstream.pci.secondary_bus <= bus
+	  && child->attr->bridge.downstream.pci.subordinate_bus >= bus)
+	/* not the right bus id, but it's included in the bus range below that PCI bridge, recurse */
+	return hwloc__pci_find_by_busid(child, domain, bus, dev, func);
+
+    } else if (child->type == HWLOC_OBJ_BRIDGE
+	       && child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI
+	       && child->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
+	       /* non-PCI to PCI bridge: its pcidev attributes are not used, only its downstream bus range */
+	       && child->attr->bridge.downstream.pci.domain == domain
+	       && child->attr->bridge.downstream.pci.secondary_bus <= bus
+	       && child->attr->bridge.downstream.pci.subordinate_bus >= bus) {
+      /* contains our bus, recurse */
+      return hwloc__pci_find_by_busid(child, domain, bus, dev, func);
+    }
+  }
+  /* no child matches or contains that busid, return parent */
+  return parent;
+}
+
+/* Look up a PCI bus id below the parent of the matching PCI locality (or below root if none matches).
+ * Returns NULL when nothing more specific than the root object is found. */
+struct hwloc_obj *
+hwloc_pci_find_by_busid(struct hwloc_topology *topology,
+			unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t root = hwloc_get_root_obj(topology);
+  hwloc_obj_t start = NULL, found;
+  struct hwloc_pci_locality_s *loc;
+
+  hwloc_debug("pcidisc looking for bus id %04x:%02x:%02x.%01x\n", domain, bus, dev, func);
+  for (loc = topology->first_pci_locality; loc; loc = loc->next) {
+    if (loc->domain != domain || loc->bus_min > bus || loc->bus_max < bus)
+      continue;
+    start = loc->parent;
+    assert(start);
+    hwloc_debug("  found pci locality for %04x:[%02x:%02x]\n",
+		loc->domain, loc->bus_min, loc->bus_max);
+    break;
+  }
+  /* if we failed to insert localities, look at root too */
+  if (!start)
+    start = root;
+
+  hwloc_debug("  looking for bus %04x:%02x:%02x.%01x below %s P#%u\n",
+	      domain, bus, dev, func,
+	      hwloc_obj_type_string(start->type), start->os_index);
+  found = hwloc__pci_find_by_busid(start, domain, bus, dev, func);
+  if (found == root) {
+    hwloc_debug("  found nothing better than root object, ignoring\n");
+    return NULL;
+  }
+
+  if (found->type == HWLOC_OBJ_PCI_DEVICE
+      || (found->type == HWLOC_OBJ_BRIDGE && found->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI))
+    hwloc_debug("  found busid %04x:%02x:%02x.%01x\n",
+		found->attr->pcidev.domain, found->attr->pcidev.bus,
+		found->attr->pcidev.dev, found->attr->pcidev.func);
+  else
+    hwloc_debug("  found parent %s P#%u\n",
+		hwloc_obj_type_string(found->type), found->os_index);
+  return found;
+}
+
+
+/*******************************
+ * Parsing the PCI Config Space
+ */
+
 #define HWLOC_PCI_STATUS 0x06
 #define HWLOC_PCI_STATUS_CAP_LIST 0x10
 #define HWLOC_PCI_CAPABILITY_LIST 0x34
@@ -392,7 +773,7 @@ hwloc_insert_pci_device_list(struct hwloc_backend *backend,
 #define HWLOC_PCI_CAP_LIST_NEXT 1
 
 unsigned
-hwloc_pci_find_cap(const unsigned char *config, unsigned cap)
+hwloc_pcidisc_find_cap(const unsigned char *config, unsigned cap)
 {
   unsigned char seen[256] = { 0 };
   unsigned char ptr; /* unsigned char to make sure we stay within the 256-byte config space */
@@ -424,8 +805,8 @@ hwloc_pci_find_cap(const unsigned char *config, unsigned cap)
 #define HWLOC_PCI_EXP_LNKSTA_WIDTH 0x03f0
 
 int
-hwloc_pci_find_linkspeed(const unsigned char *config,
-			 unsigned offset, float *linkspeed)
+hwloc_pcidisc_find_linkspeed(const unsigned char *config,
+			     unsigned offset, float *linkspeed)
 {
   unsigned linksta, speed, width;
   float lanespeed;
@@ -436,47 +817,244 @@ hwloc_pci_find_linkspeed(const unsigned char *config,
   /* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding    = 0.25GB/s data-rate per lane
    * PCIe Gen2 = 5  GT/s signal-rate per lane with 8/10 encoding    = 0.5 GB/s data-rate per lane
    * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
+   * PCIe Gen4 = 16 GT/s signal-rate per lane with 128/130 encoding = 2   GB/s data-rate per lane
+   * PCIe Gen5 = 32 GT/s signal-rate per lane with 128/130 encoding = 4   GB/s data-rate per lane
    */
-  lanespeed = speed <= 2 ? 2.5 * speed * 0.8 : 8.0 * 128/130; /* Gbit/s per lane */
-  *linkspeed = lanespeed * width / 8; /* GB/s */
+
+  /* lanespeed in Gbit/s */
+  if (speed <= 2)
+    lanespeed = 2.5f * speed * 0.8f;
+  else
+    lanespeed = 8.0f * (1<<(speed-3)) * 128/130; /* assume Gen6 will be 64 GT/s and so on */
+
+  /* linkspeed in GB/s */
+  *linkspeed = lanespeed * width / 8;
   return 0;
 }
 
 #define HWLOC_PCI_HEADER_TYPE 0x0e
 #define HWLOC_PCI_HEADER_TYPE_BRIDGE 1
 #define HWLOC_PCI_CLASS_BRIDGE_PCI 0x0604
-#define HWLOC_PCI_PRIMARY_BUS 0x18
-#define HWLOC_PCI_SECONDARY_BUS 0x19
-#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
 
-int
-hwloc_pci_prepare_bridge(hwloc_obj_t obj,
-			 const unsigned char *config)
+hwloc_obj_type_t
+hwloc_pcidisc_check_bridge_type(unsigned device_class, const unsigned char *config)
 {
   unsigned char headertype;
-  unsigned isbridge;
-  struct hwloc_pcidev_attr_s *pattr = &obj->attr->pcidev;
-  struct hwloc_bridge_attr_s *battr;
+
+  if (device_class != HWLOC_PCI_CLASS_BRIDGE_PCI)
+    return HWLOC_OBJ_PCI_DEVICE;
 
   headertype = config[HWLOC_PCI_HEADER_TYPE] & 0x7f;
-  isbridge = (pattr->class_id == HWLOC_PCI_CLASS_BRIDGE_PCI
-	      && headertype == HWLOC_PCI_HEADER_TYPE_BRIDGE);
+  return (headertype == HWLOC_PCI_HEADER_TYPE_BRIDGE)
+    ? HWLOC_OBJ_BRIDGE : HWLOC_OBJ_PCI_DEVICE;
+}
 
-  if (!isbridge)
-    return 0;
+#define HWLOC_PCI_PRIMARY_BUS 0x18
+#define HWLOC_PCI_SECONDARY_BUS 0x19
+#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
 
-  battr = &obj->attr->bridge;
+int
+hwloc_pcidisc_find_bridge_buses(unsigned domain, unsigned bus, unsigned dev, unsigned func,
+				unsigned *secondary_busp, unsigned *subordinate_busp,
+				const unsigned char *config)
+{
+  unsigned secondary_bus, subordinate_bus;
 
-  if (config[HWLOC_PCI_PRIMARY_BUS] != pattr->bus)
+  if (config[HWLOC_PCI_PRIMARY_BUS] != bus) {
+    /* Sometimes the config space contains 00 instead of the actual primary bus number.
+     * Always trust the bus ID because it was built by the system which has more information
+     * to workaround such problems (e.g. ACPI information about PCI parent/children).
+     */
     hwloc_debug("  %04x:%02x:%02x.%01x bridge with (ignored) invalid PCI_PRIMARY_BUS %02x\n",
-		pattr->domain, pattr->bus, pattr->dev, pattr->func, config[HWLOC_PCI_PRIMARY_BUS]);
+		domain, bus, dev, func, config[HWLOC_PCI_PRIMARY_BUS]);
+  }
 
-  obj->type = HWLOC_OBJ_BRIDGE;
-  battr->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
-  battr->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
-  battr->downstream.pci.domain = pattr->domain;
-  battr->downstream.pci.secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
-  battr->downstream.pci.subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
+  secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
+  subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
 
+  if (secondary_bus <= bus
+      || subordinate_bus <= bus
+      || secondary_bus > subordinate_bus) {
+    /* This should catch most cases of invalid bridge information
+     * (e.g. 00 for secondary and subordinate).
+     * Ideally we would also check that [secondary-subordinate] is included
+     * in the parent bridge [secondary+1:subordinate]. But that's hard to do
+     * because objects may be discovered out of order (especially in the fsroot case).
+     */
+    hwloc_debug("  %04x:%02x:%02x.%01x bridge has invalid secondary-subordinate buses [%02x-%02x]\n",
+		domain, bus, dev, func,
+		secondary_bus, subordinate_bus);
+    return -1;
+  }
+
+  *secondary_busp = secondary_bus;
+  *subordinate_busp = subordinate_bus;
   return 0;
 }
+
+
+/****************
+ * Class Strings
+ */
+
+const char *
+hwloc_pci_class_string(unsigned short class_id)
+{
+  /* Map a PCI class id (base class in the high byte, subclass in the low byte) to a short static name; see https://pci-ids.ucw.cz/read/PD/ */
+  switch ((class_id & 0xff00) >> 8) {
+    case 0x00:
+      switch (class_id) {
+	case 0x0001: return "VGA";
+      }
+      break;
+    case 0x01:
+      switch (class_id) {
+	case 0x0100: return "SCSI";
+	case 0x0101: return "IDE";
+	case 0x0102: return "Floppy";
+	case 0x0103: return "IPI";
+	case 0x0104: return "RAID";
+	case 0x0105: return "ATA";
+	case 0x0106: return "SATA";
+	case 0x0107: return "SAS";
+	case 0x0108: return "NVMExp";
+      }
+      return "Storage";
+    case 0x02:
+      switch (class_id) {
+	case 0x0200: return "Ethernet";
+	case 0x0201: return "TokenRing";
+	case 0x0202: return "FDDI";
+	case 0x0203: return "ATM";
+	case 0x0204: return "ISDN";
+	case 0x0205: return "WorldFip";
+	case 0x0206: return "PICMG";
+	case 0x0207: return "InfiniBand";
+	case 0x0208: return "Fabric";
+      }
+      return "Network";
+    case 0x03:
+      switch (class_id) {
+	case 0x0300: return "VGA";
+	case 0x0301: return "XGA";
+	case 0x0302: return "3D";
+      }
+      return "Display";
+    case 0x04:
+      switch (class_id) {
+	case 0x0400: return "MultimediaVideo";
+	case 0x0401: return "MultimediaAudio";
+	case 0x0402: return "Telephony";
+	case 0x0403: return "AudioDevice";
+      }
+      return "Multimedia";
+    case 0x05:
+      switch (class_id) {
+	case 0x0500: return "RAM";
+	case 0x0501: return "Flash";
+      }
+      return "Memory";
+    case 0x06:
+      switch (class_id) {
+	case 0x0600: return "HostBridge";
+	case 0x0601: return "ISABridge";
+	case 0x0602: return "EISABridge";
+	case 0x0603: return "MicroChannelBridge";
+	case 0x0604: return "PCIBridge";
+	case 0x0605: return "PCMCIABridge";
+	case 0x0606: return "NubusBridge";
+	case 0x0607: return "CardBusBridge";
+	case 0x0608: return "RACEwayBridge";
+	case 0x0609: return "SemiTransparentPCIBridge";
+	case 0x060a: return "InfiniBandPCIHostBridge";
+      }
+      return "Bridge";
+    case 0x07:
+      switch (class_id) {
+	case 0x0700: return "Serial";
+	case 0x0701: return "Parallel";
+	case 0x0702: return "MultiportSerial";
+	case 0x0703: return "Modem";
+	case 0x0704: return "GPIB";
+	case 0x0705: return "SmartCard";
+      }
+      return "Communication";
+    case 0x08:
+      switch (class_id) {
+	case 0x0800: return "PIC";
+	case 0x0801: return "DMA";
+	case 0x0802: return "Timer";
+	case 0x0803: return "RTC";
+	case 0x0804: return "PCIHotPlug";
+	case 0x0805: return "SDHost";
+	case 0x0806: return "IOMMU";
+      }
+      return "SystemPeripheral";
+    case 0x09:
+      switch (class_id) {
+	case 0x0900: return "Keyboard";
+	case 0x0901: return "DigitizerPen";
+	case 0x0902: return "Mouse";
+	case 0x0903: return "Scanner";
+	case 0x0904: return "Gameport";
+      }
+      return "Input";
+    case 0x0a:
+      return "DockingStation";
+    case 0x0b:
+      switch (class_id) {
+	case 0x0b00: return "386";
+	case 0x0b01: return "486";
+	case 0x0b02: return "Pentium";
+/* 0x0b03 and 0x0b04 might be Pentium and P6 ? */
+	case 0x0b10: return "Alpha";
+	case 0x0b20: return "PowerPC";
+	case 0x0b30: return "MIPS";
+	case 0x0b40: return "Co-Processor";
+      }
+      return "Processor";
+    case 0x0c:
+      switch (class_id) {
+	case 0x0c00: return "FireWire";
+	case 0x0c01: return "ACCESS";
+	case 0x0c02: return "SSA";
+	case 0x0c03: return "USB";
+	case 0x0c04: return "FibreChannel";
+	case 0x0c05: return "SMBus";
+	case 0x0c06: return "InfiniBand";
+	case 0x0c07: return "IPMI-SMIC";
+	case 0x0c08: return "SERCOS";
+	case 0x0c09: return "CANBUS";
+      }
+      return "SerialBus";
+    case 0x0d:
+      switch (class_id) {
+	case 0x0d00: return "IRDA";
+	case 0x0d01: return "ConsumerIR";
+	case 0x0d10: return "RF";
+	case 0x0d11: return "Bluetooth";
+	case 0x0d12: return "Broadband";
+	case 0x0d20: return "802.1a";
+	case 0x0d21: return "802.1b";
+      }
+      return "Wireless";
+    case 0x0e:
+      switch (class_id) {
+	case 0x0e00: return "I2O";
+      }
+      return "Intelligent";
+    case 0x0f:
+      return "Satellite";
+    case 0x10:
+      return "Encryption";
+    case 0x11:
+      return "SignalProcessing";
+    case 0x12:
+      return "ProcessingAccelerator";
+    case 0x13:
+      return "Instrumentation";
+    case 0x40:
+      return "Co-Processor";
+  }
+  return "Other";
+}
diff --git a/ext/hwloc/hwloc/shmem.c b/ext/hwloc/hwloc/shmem.c
new file mode 100644
index 000000000..94d55eef7
--- /dev/null
+++ b/ext/hwloc/hwloc/shmem.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright © 2017-2019 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "hwloc/shmem.h"
+#include "private/private.h"
+
+#ifndef HWLOC_WIN_SYS
+
+#include <sys/mman.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <assert.h>
+
+#define HWLOC_SHMEM_HEADER_VERSION 1
+/* Header stored in front of the duplicated topology; written by hwloc_shmem_topology_write() and validated by hwloc_shmem_topology_adopt(). */
+struct hwloc_shmem_header {
+  uint32_t header_version; /* sanity check */
+  uint32_t header_length; /* where the actual topology starts in the file/mapping */
+  uint64_t mmap_address; /* virtual address to pass to mmap */
+  uint64_t mmap_length; /* length to pass to mmap (includes the header) */
+};
+
+#define HWLOC_SHMEM_MALLOC_ALIGN 8UL
+
+static void *
+tma_shmem_malloc(struct hwloc_tma * tma,
+		 size_t length)
+{
+  char *cursor = tma->data; /* bump allocation: the caller gets the current cursor */
+  size_t rounded = (length + HWLOC_SHMEM_MALLOC_ALIGN - 1) & ~(HWLOC_SHMEM_MALLOC_ALIGN - 1);
+  tma->data = cursor + rounded; /* next allocation starts on the following aligned boundary */
+  return cursor;
+}
+
+static void *
+tma_get_length_malloc(struct hwloc_tma * tma,
+		      size_t length)
+{
+  /* counting allocator: tma->data points to a size_t total, the memory itself comes from malloc() */
+  size_t rounded = (length + HWLOC_SHMEM_MALLOC_ALIGN - 1) & ~(HWLOC_SHMEM_MALLOC_ALIGN - 1);
+  *(size_t *)tma->data += rounded;
+  return malloc(length);
+}
+
+int
+hwloc_shmem_topology_get_length(hwloc_topology_t topology,
+				size_t *lengthp,
+				unsigned long flags)
+{
+  /* Dry-run duplication with the counting allocator; *lengthp receives header + topology size rounded up to whole pages. */
+  const unsigned long page = hwloc_getpagesize(); /* mmap() needs a full-page length */
+  hwloc_topology_t scratch;
+  struct hwloc_tma counter;
+  size_t needed = 0;
+  int rc;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  counter.malloc = tma_get_length_malloc;
+  counter.dontfree = 0;
+  counter.data = &needed;
+  rc = hwloc__topology_dup(&scratch, topology, &counter);
+  if (rc < 0)
+    return rc;
+  hwloc_topology_destroy(scratch); /* only the accumulated byte count is kept */
+
+  *lengthp = (sizeof(struct hwloc_shmem_header) + needed + page - 1) & ~(page - 1);
+  return 0;
+}
+
+/* Store a header plus a duplicate of topology in fd at fileoffset, laid out for a later mapping at mmap_address.
+ * Returns 0 on success and a negative value on error; flags must be 0 (EINVAL otherwise). */
+int
+hwloc_shmem_topology_write(hwloc_topology_t topology,
+			   int fd, hwloc_uint64_t fileoffset,
+			   void *mmap_address, size_t length,
+			   unsigned long flags)
+{
+  hwloc_topology_t new;
+  struct hwloc_tma tma;
+  struct hwloc_shmem_header header;
+  void *mmap_res;
+  int err;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* refresh old topology distances so that we don't uselessly duplicate invalid distances
+   * without being able to free() them.
+   */
+  hwloc_internal_distances_refresh(topology);
+
+  header.header_version = HWLOC_SHMEM_HEADER_VERSION;
+  header.header_length = sizeof(header);
+  header.mmap_address = (uintptr_t) mmap_address;
+  header.mmap_length = length;
+
+  /* test the off_t/ssize_t results directly: an int copy of lseek()'s result can go negative for offsets above INT_MAX */
+  if (lseek(fd, fileoffset, SEEK_SET) < 0)
+    return -1;
+  if (write(fd, &header, sizeof(header)) != (ssize_t) sizeof(header))
+    return -1;
+  if (ftruncate(fd, fileoffset + length) < 0)
+    return -1;
+
+  mmap_res = mmap(mmap_address, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, fileoffset);
+  if (mmap_res == MAP_FAILED)
+    return -1;
+  if (mmap_res != mmap_address) {
+    munmap(mmap_res, length);
+    errno = EBUSY;
+    return -1;
+  }
+
+  tma.malloc = tma_shmem_malloc;
+  tma.dontfree = 1;
+  tma.data = (char *)mmap_res + sizeof(header);
+  err = hwloc__topology_dup(&new, topology, &tma);
+  if (err < 0) {
+    munmap(mmap_res, length); /* do not leak the mapping when the duplication fails */
+    return err;
+  }
+  assert((char*)new == (char*)mmap_address + sizeof(header));
+  /* the allocator cursor must not have moved past the end of the mapping */
+  assert((char *)tma.data <= (char *)mmap_address + length);
+
+  /* now refresh the new distances so that adopters can use them without refreshing the R/O shmem mapping */
+  hwloc_internal_distances_refresh(new);
+
+  /* topology is saved, release resources now */
+  munmap(mmap_address, length);
+  hwloc_components_fini();
+
+  return 0;
+}
+
+/* Map read-only a topology stored in fd at fileoffset and return a locally usable handle in *topologyp.
+ * mmap_address and length must equal the values recorded in the header read at fileoffset (EINVAL otherwise).
+ * Returns 0 on success, -1 on error; flags must be 0. */
+int
+hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp,
+			   int fd, hwloc_uint64_t fileoffset,
+			   void *mmap_address, size_t length,
+			   unsigned long flags)
+{
+  hwloc_topology_t new, old;
+  struct hwloc_shmem_header header;
+  void *mmap_res;
+
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* test the off_t/ssize_t results directly: an int copy of lseek()'s result can go negative for offsets above INT_MAX */
+  if (lseek(fd, fileoffset, SEEK_SET) < 0)
+    return -1;
+  if (read(fd, &header, sizeof(header)) != (ssize_t) sizeof(header))
+    return -1;
+
+  if (header.header_version != HWLOC_SHMEM_HEADER_VERSION
+      || header.header_length != sizeof(header)
+      || header.mmap_address != (uintptr_t) mmap_address
+      || header.mmap_length != length) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  mmap_res = mmap(mmap_address, length, PROT_READ, MAP_SHARED, fd, fileoffset);
+  if (mmap_res == MAP_FAILED)
+    return -1;
+  if (mmap_res != mmap_address) {
+    errno = EBUSY;
+    goto out_with_mmap;
+  }
+
+  old = (hwloc_topology_t)((char*)mmap_address + sizeof(header));
+  if (hwloc_topology_abi_check(old) < 0) {
+    errno = EINVAL;
+    goto out_with_mmap;
+  }
+
+  /* enforced by dup() inside shmem_topology_write() */
+  assert(old->is_loaded);
+  assert(old->backends == NULL);
+  assert(old->get_pci_busid_cpuset_backend == NULL);
+
+  hwloc_components_init();
+
+  /* duplicate the topology object so that we can use local binding_hooks
+   * (those are likely not mapped at the same location in both processes).
+   */
+  new = malloc(sizeof(struct hwloc_topology));
+  if (!new)
+    goto out_with_components;
+  memcpy(new, old, sizeof(*old));
+  new->tma = NULL;
+  new->adopted_shmem_addr = mmap_address;
+  new->adopted_shmem_length = length;
+  new->topology_abi = HWLOC_TOPOLOGY_ABI;
+  /* setting binding hooks will touch support arrays, so duplicate them too.
+   * could avoid that by requesting a R/W mmap
+   */
+  new->support.discovery = malloc(sizeof(*new->support.discovery));
+  new->support.cpubind = malloc(sizeof(*new->support.cpubind));
+  new->support.membind = malloc(sizeof(*new->support.membind));
+  if (!new->support.discovery || !new->support.cpubind || !new->support.membind)
+    goto out_with_support;
+  memcpy(new->support.discovery, old->support.discovery, sizeof(*new->support.discovery));
+  memcpy(new->support.cpubind, old->support.cpubind, sizeof(*new->support.cpubind));
+  memcpy(new->support.membind, old->support.membind, sizeof(*new->support.membind));
+  hwloc_set_binding_hooks(new);
+  /* clear userdata callbacks pointing to the writer process' functions */
+  new->userdata_export_cb = NULL;
+  new->userdata_import_cb = NULL;
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(new);
+
+  *topologyp = new;
+  return 0;
+
+ out_with_support:
+  free(new->support.discovery);
+  free(new->support.cpubind);
+  free(new->support.membind);
+  free(new);
+ out_with_components:
+  hwloc_components_fini();
+ out_with_mmap:
+  munmap(mmap_res, length);
+  return -1;
+}
+/* Release a topology handle built by hwloc_shmem_topology_adopt(): the mapping, the private support arrays and the handle itself. */
+void
+hwloc__topology_disadopt(hwloc_topology_t topology)
+{
+  hwloc_components_fini(); /* pairs with the hwloc_components_init() call made in hwloc_shmem_topology_adopt() */
+  munmap(topology->adopted_shmem_addr, topology->adopted_shmem_length);
+  free(topology->support.discovery); /* the three support arrays were malloc'ed copies made at adoption */
+  free(topology->support.cpubind);
+  free(topology->support.membind);
+  free(topology);
+}
+
+#else /* HWLOC_WIN_SYS */
+
+int
+hwloc_shmem_topology_get_length(hwloc_topology_t topology __hwloc_attribute_unused,
+				size_t *lengthp __hwloc_attribute_unused,
+				unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS; /* stub for HWLOC_WIN_SYS builds: always fails, *lengthp is left untouched */
+  return -1;
+}
+
+int
+hwloc_shmem_topology_write(hwloc_topology_t topology __hwloc_attribute_unused,
+			   int fd __hwloc_attribute_unused, hwloc_uint64_t fileoffset __hwloc_attribute_unused,
+			   void *mmap_address __hwloc_attribute_unused, size_t length __hwloc_attribute_unused,
+			   unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS; /* stub for HWLOC_WIN_SYS builds: always fails, nothing is written to fd */
+  return -1;
+}
+
+int
+hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp __hwloc_attribute_unused,
+			   int fd __hwloc_attribute_unused, hwloc_uint64_t fileoffset __hwloc_attribute_unused,
+			   void *mmap_address __hwloc_attribute_unused, size_t length __hwloc_attribute_unused,
+			   unsigned long flags __hwloc_attribute_unused)
+{
+  errno = ENOSYS; /* stub for HWLOC_WIN_SYS builds: always fails, *topologyp is left untouched */
+  return -1;
+}
+
+void
+hwloc__topology_disadopt(hwloc_topology_t topology __hwloc_attribute_unused)
+{ /* stub for HWLOC_WIN_SYS builds: nothing to release since the adopt stub in this branch always fails */
+}
+
+#endif /* HWLOC_WIN_SYS */
diff --git a/ext/hwloc/hwloc/topology-bgq.cb b/ext/hwloc/hwloc/topology-bgq.cb
deleted file mode 100644
index 3998f31ab..000000000
--- a/ext/hwloc/hwloc/topology-bgq.cb
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright © 2013-2015 Inria.  All rights reserved.
- * See COPYING in top-level directory.
- */
-
-#include <private/autogen/config.h>
-
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-
-#include <stdlib.h>
-#include <pthread.h>
-#include <sys/utsname.h>
-#include <spi/include/kernel/location.h>
-#include <spi/include/kernel/process.h>
-
-static int
-hwloc_look_bgq(struct hwloc_backend *backend)
-{
-  struct hwloc_topology *topology = backend->topology;
-  unsigned i;
-  const char *env;
-
-  if (!topology->levels[0][0]->cpuset) {
-    /* Nobody created objects yet, setup everything */
-    hwloc_bitmap_t set;
-    hwloc_obj_t obj;
-
-#define HWLOC_BGQ_CORES 17 /* spare core ignored for now */
-
-    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-    /* mark the 17th core (OS-reserved) as disallowed */
-    hwloc_bitmap_clr_range(topology->levels[0][0]->allowed_cpuset, (HWLOC_BGQ_CORES-1)*4, HWLOC_BGQ_CORES*4-1);
-
-    env = getenv("BG_THREADMODEL");
-    if (!env || atoi(env) != 2) {
-      /* process cannot use cores/threads outside of its Kernel_ThreadMask() */
-      uint64_t bgmask = Kernel_ThreadMask(Kernel_MyTcoord());
-      /* the mask is reversed, manually reverse it */
-      for(i=0; i<64; i++)
-	if (((bgmask >> i) & 1) == 0)
-	  hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, 63-i);
-    }
-
-    /* a single memory bank */
-    obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
-    set = hwloc_bitmap_alloc();
-    hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
-    obj->cpuset = set;
-    set = hwloc_bitmap_alloc();
-    hwloc_bitmap_set(set, 0);
-    obj->nodeset = set;
-    obj->memory.local_memory = 16ULL*1024*1024*1024ULL;
-    hwloc_insert_object_by_cpuset(topology, obj);
-
-    /* package */
-    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, 0);
-    set = hwloc_bitmap_alloc();
-    hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
-    obj->cpuset = set;
-    hwloc_obj_add_info(obj, "CPUModel", "IBM PowerPC A2");
-    hwloc_insert_object_by_cpuset(topology, obj);
-
-    /* shared L2 */
-    obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
-    obj->cpuset = hwloc_bitmap_dup(set);
-    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
-    obj->attr->cache.depth = 2;
-    obj->attr->cache.size = 32*1024*1024;
-    obj->attr->cache.linesize = 128;
-    obj->attr->cache.associativity = 16;
-    hwloc_insert_object_by_cpuset(topology, obj);
-
-    /* Cores */
-    for(i=0; i<HWLOC_BGQ_CORES; i++) {
-      /* Core */
-      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
-      set = hwloc_bitmap_alloc();
-      hwloc_bitmap_set_range(set, i*4, i*4+3);
-      obj->cpuset = set;
-      hwloc_insert_object_by_cpuset(topology, obj);
-      /* L1d */
-      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
-      obj->cpuset = hwloc_bitmap_dup(set);
-      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
-      obj->attr->cache.depth = 1;
-      obj->attr->cache.size = 16*1024;
-      obj->attr->cache.linesize = 64;
-      obj->attr->cache.associativity = 8;
-      hwloc_insert_object_by_cpuset(topology, obj);
-      /* L1i */
-      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
-      obj->cpuset = hwloc_bitmap_dup(set);
-      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
-      obj->attr->cache.depth = 1;
-      obj->attr->cache.size = 16*1024;
-      obj->attr->cache.linesize = 64;
-      obj->attr->cache.associativity = 4;
-      hwloc_insert_object_by_cpuset(topology, obj);
-      /* there's also a L1p "prefetch cache" of 4kB with 128B lines */
-    }
-
-    /* PUs */
-    hwloc_setup_pu_level(topology, HWLOC_BGQ_CORES*4);
-  }
-
-  /* Add BGQ specific information */
-
-  hwloc_obj_add_info(topology->levels[0][0], "Backend", "BGQ");
-  if (topology->is_thissystem)
-    hwloc_add_uname_info(topology, NULL);
-  return 1;
-}
-
-static int
-hwloc_bgq_get_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
-{
-  unsigned pu;
-  cpu_set_t bg_set;
-  int err;
-
-  if (topology->pid) {
-    errno = ENOSYS;
-    return -1;
-  }
-  err = pthread_getaffinity_np(thread, sizeof(bg_set), &bg_set);
-  if (err) {
-    errno = err;
-    return -1;
-  }
-  for(pu=0; pu<64; pu++)
-    if (CPU_ISSET(pu, &bg_set)) {
-      /* the binding cannot contain multiple PUs */
-      hwloc_bitmap_only(hwloc_set, pu);
-      break;
-    }
-  return 0;
-}
-
-static int
-hwloc_bgq_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
-{
-  if (topology->pid) {
-    errno = ENOSYS;
-    return -1;
-  }
-  hwloc_bitmap_only(hwloc_set, Kernel_ProcessorID());
-  return 0;
-}
-
-static int
-hwloc_bgq_set_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  unsigned pu;
-  cpu_set_t bg_set;
-  int err;
-
-  if (topology->pid) {
-    errno = ENOSYS;
-    return -1;
-  }
-  /* the binding cannot contain multiple PUs.
-   * keep the first PU only, and error out if STRICT.
-   */
-  if (hwloc_bitmap_weight(hwloc_set) != 1) {
-    if ((flags & HWLOC_CPUBIND_STRICT)) {
-      errno = ENOSYS;
-      return -1;
-    }
-  }
-  pu = hwloc_bitmap_first(hwloc_set);
-  CPU_ZERO(&bg_set);
-  CPU_SET(pu, &bg_set);
-  err = pthread_setaffinity_np(thread, sizeof(bg_set), &bg_set);
-  if (err) {
-    errno = err;
-    return -1;
-  }
-  return 0;
-}
-
-static int
-hwloc_bgq_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  return hwloc_bgq_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
-}
-
-void
-hwloc_set_bgq_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
-		    struct hwloc_topology_support *support __hwloc_attribute_unused)
-{
-  hooks->set_thisthread_cpubind = hwloc_bgq_set_thisthread_cpubind;
-  hooks->set_thread_cpubind = hwloc_bgq_set_thread_cpubind;
-  hooks->get_thisthread_cpubind = hwloc_bgq_get_thisthread_cpubind;
-  hooks->get_thread_cpubind = hwloc_bgq_get_thread_cpubind;
-  /* threads cannot be bound to more than one PU, so get_last_cpu_location == get_cpubind */
-  hooks->get_thisthread_last_cpu_location = hwloc_bgq_get_thisthread_cpubind;
-  /* hooks->get_thread_last_cpu_location = hwloc_bgq_get_thread_cpubind; */
-}
-
-static struct hwloc_backend *
-hwloc_bgq_component_instantiate(struct hwloc_disc_component *component,
-				const void *_data1 __hwloc_attribute_unused,
-				const void *_data2 __hwloc_attribute_unused,
-				const void *_data3 __hwloc_attribute_unused)
-{
-  struct utsname utsname;
-  struct hwloc_backend *backend;
-  const char *env;
-  int err;
-
-  env = getenv("HWLOC_FORCE_BGQ");
-  if (!env || !atoi(env)) {
-    err = uname(&utsname);
-    if (err || strcmp(utsname.sysname, "CNK") || strcmp(utsname.machine, "BGQ")) {
-      fprintf(stderr, "*** Found unexpected uname sysname `%s' machine `%s'\n", utsname.sysname, utsname.machine);
-      fprintf(stderr, "*** The BGQ backend is only enabled on compute nodes by default (sysname=CNK machine=BGQ)\n");
-      fprintf(stderr, "*** Set HWLOC_FORCE_BGQ=1 in the environment to enforce the BGQ backend anyway.\n");
-      return NULL;
-    }
-  }
-
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-  backend->discover = hwloc_look_bgq;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_bgq_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  "bgq",
-  ~0,
-  hwloc_bgq_component_instantiate,
-  50,
-  NULL
-};
-
-const struct hwloc_component hwloc_bgq_component = {
-  HWLOC_COMPONENT_ABI,
-  NULL, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_bgq_disc_component
-};
diff --git a/ext/hwloc/hwloc/topology-darwin.cb b/ext/hwloc/hwloc/topology-darwin.cb
deleted file mode 100644
index 1062a1d0c..000000000
--- a/ext/hwloc/hwloc/topology-darwin.cb
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
- * Copyright © 2009-2013 Université Bordeaux
- * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
- * See COPYING in top-level directory.
- */
-
-/* Detect topology change: registering for power management changes and check
- * if for example hw.activecpu changed */
-
-/* Apparently, Darwin people do not _want_ to provide binding functions.  */
-
-#include <private/autogen/config.h>
-
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-
-static int
-hwloc_look_darwin(struct hwloc_backend *backend)
-{
-  struct hwloc_topology *topology = backend->topology;
-  int64_t _nprocs;
-  unsigned nprocs;
-  int64_t _npackages;
-  unsigned i, j, cpu;
-  struct hwloc_obj *obj;
-  size_t size;
-  int64_t l1dcachesize, l1icachesize;
-  int64_t cacheways[2];
-  int64_t l2cachesize;
-  int64_t cachelinesize;
-  int64_t memsize;
-  char cpumodel[64];
-
-  if (topology->levels[0][0]->cpuset)
-    /* somebody discovered things */
-    return 0;
-
-  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-
-  if (hwloc_get_sysctlbyname("hw.ncpu", &_nprocs) || _nprocs <= 0)
-    return -1;
-  nprocs = _nprocs;
-  topology->support.discovery->pu = 1;
-
-  hwloc_debug("%u procs\n", nprocs);
-
-  size = sizeof(cpumodel);
-  if (sysctlbyname("machdep.cpu.brand_string", cpumodel, &size, NULL, 0))
-    cpumodel[0] = '\0';
-
-  if (!hwloc_get_sysctlbyname("hw.packages", &_npackages) && _npackages > 0) {
-    unsigned npackages = _npackages;
-    int64_t _cores_per_package;
-    int64_t _logical_per_package;
-    unsigned logical_per_package;
-
-    hwloc_debug("%u packages\n", npackages);
-
-    if (!hwloc_get_sysctlbyname("machdep.cpu.logical_per_package", &_logical_per_package) && _logical_per_package > 0)
-      logical_per_package = _logical_per_package;
-    else
-      /* Assume the trivia.  */
-      logical_per_package = nprocs / npackages;
-
-    hwloc_debug("%u threads per package\n", logical_per_package);
-
-
-    if (nprocs == npackages * logical_per_package)
-      for (i = 0; i < npackages; i++) {
-        obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, i);
-        obj->cpuset = hwloc_bitmap_alloc();
-        for (cpu = i*logical_per_package; cpu < (i+1)*logical_per_package; cpu++)
-          hwloc_bitmap_set(obj->cpuset, cpu);
-
-        hwloc_debug_1arg_bitmap("package %u has cpuset %s\n",
-                   i, obj->cpuset);
-
-        if (cpumodel[0] != '\0')
-          hwloc_obj_add_info(obj, "CPUModel", cpumodel);
-        hwloc_insert_object_by_cpuset(topology, obj);
-      }
-    else
-      if (cpumodel[0] != '\0')
-        hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
-
-    if (!hwloc_get_sysctlbyname("machdep.cpu.cores_per_package", &_cores_per_package) && _cores_per_package > 0) {
-      unsigned cores_per_package = _cores_per_package;
-      hwloc_debug("%u cores per package\n", cores_per_package);
-
-      if (!(logical_per_package % cores_per_package))
-        for (i = 0; i < npackages * cores_per_package; i++) {
-          obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
-          obj->cpuset = hwloc_bitmap_alloc();
-          for (cpu = i*(logical_per_package/cores_per_package);
-               cpu < (i+1)*(logical_per_package/cores_per_package);
-               cpu++)
-            hwloc_bitmap_set(obj->cpuset, cpu);
-
-          hwloc_debug_1arg_bitmap("core %u has cpuset %s\n",
-                     i, obj->cpuset);
-          hwloc_insert_object_by_cpuset(topology, obj);
-        }
-    }
-  } else
-    if (cpumodel[0] != '\0')
-      hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
-
-  if (hwloc_get_sysctlbyname("hw.l1dcachesize", &l1dcachesize))
-    l1dcachesize = 0;
-
-  if (hwloc_get_sysctlbyname("hw.l1icachesize", &l1icachesize))
-    l1icachesize = 0;
-
-  if (hwloc_get_sysctlbyname("hw.l2cachesize", &l2cachesize))
-    l2cachesize = 0;
-
-  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L1_associativity", &cacheways[0]))
-    cacheways[0] = 0;
-  else if (cacheways[0] == 0xff)
-    cacheways[0] = -1;
-
-  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L2_associativity", &cacheways[1]))
-    cacheways[1] = 0;
-  else if (cacheways[1] == 0xff)
-    cacheways[1] = -1;
-
-  if (hwloc_get_sysctlbyname("hw.cachelinesize", &cachelinesize))
-    cachelinesize = 0;
-
-  if (hwloc_get_sysctlbyname("hw.memsize", &memsize))
-    memsize = 0;
-
-  if (!sysctlbyname("hw.cacheconfig", NULL, &size, NULL, 0)) {
-    unsigned n = size / sizeof(uint32_t);
-    uint64_t *cacheconfig = NULL;
-    uint64_t *cachesize = NULL;
-    uint32_t *cacheconfig32 = NULL;
-
-    cacheconfig = malloc(sizeof(uint64_t) * n);
-    if (NULL == cacheconfig) {
-        goto out;
-    }
-    cachesize = malloc(sizeof(uint64_t) * n);
-    if (NULL == cachesize) {
-        goto out;
-    }
-    cacheconfig32 = malloc(sizeof(uint32_t) * n);
-    if (NULL == cacheconfig32) {
-        goto out;
-    }
-
-    if ((!sysctlbyname("hw.cacheconfig", cacheconfig, &size, NULL, 0))) {
-      /* Yeech. Darwin seemingly has changed from 32bit to 64bit integers for
-       * cacheconfig, with apparently no way for detection. Assume the machine
-       * won't have more than 4 billion cpus */
-      if (cacheconfig[0] > 0xFFFFFFFFUL) {
-        memcpy(cacheconfig32, cacheconfig, size);
-        for (i = 0 ; i < size / sizeof(uint32_t); i++)
-          cacheconfig[i] = cacheconfig32[i];
-      }
-
-      memset(cachesize, 0, sizeof(uint64_t) * n);
-      size = sizeof(uint64_t) * n;
-      if (sysctlbyname("hw.cachesize", cachesize, &size, NULL, 0)) {
-        if (n > 0)
-          cachesize[0] = memsize;
-        if (n > 1)
-          cachesize[1] = l1dcachesize;
-        if (n > 2)
-          cachesize[2] = l2cachesize;
-      }
-
-      hwloc_debug("%s", "caches");
-      for (i = 0; i < n && cacheconfig[i]; i++)
-        hwloc_debug(" %"PRIu64"(%"PRIu64"kB)", cacheconfig[i], cachesize[i] / 1024);
-
-      /* Now we know how many caches there are */
-      n = i;
-      hwloc_debug("\n%u cache levels\n", n - 1);
-
-      /* For each cache level (0 is memory) */
-      for (i = 0; i < n; i++) {
-        /* cacheconfig tells us how many cpus share it, let's iterate on each cache */
-        for (j = 0; j < (nprocs / cacheconfig[i]); j++) {
-          obj = hwloc_alloc_setup_object(i?HWLOC_OBJ_CACHE:HWLOC_OBJ_NUMANODE, j);
-          if (!i) {
-            obj->nodeset = hwloc_bitmap_alloc();
-            hwloc_bitmap_set(obj->nodeset, j);
-          }
-          obj->cpuset = hwloc_bitmap_alloc();
-          for (cpu = j*cacheconfig[i];
-               cpu < ((j+1)*cacheconfig[i]);
-               cpu++)
-            hwloc_bitmap_set(obj->cpuset, cpu);
-
-          if (i == 1 && l1icachesize) {
-            /* FIXME assuming that L1i and L1d are shared the same way. Darwin
-             * does not yet provide a way to know.  */
-            hwloc_obj_t l1i = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, j);
-            l1i->cpuset = hwloc_bitmap_dup(obj->cpuset);
-            hwloc_debug_1arg_bitmap("L1icache %u has cpuset %s\n",
-                j, l1i->cpuset);
-            l1i->attr->cache.depth = i;
-            l1i->attr->cache.size = l1icachesize;
-            l1i->attr->cache.linesize = cachelinesize;
-            l1i->attr->cache.associativity = 0;
-            l1i->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
-
-            hwloc_insert_object_by_cpuset(topology, l1i);
-          }
-          if (i) {
-            hwloc_debug_2args_bitmap("L%ucache %u has cpuset %s\n",
-                i, j, obj->cpuset);
-            obj->attr->cache.depth = i;
-            obj->attr->cache.size = cachesize[i];
-            obj->attr->cache.linesize = cachelinesize;
-            if (i <= sizeof(cacheways) / sizeof(cacheways[0]))
-              obj->attr->cache.associativity = cacheways[i-1];
-            else
-              obj->attr->cache.associativity = 0;
-            if (i == 1 && l1icachesize)
-              obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
-            else
-              obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
-          } else {
-            hwloc_debug_1arg_bitmap("node %u has cpuset %s\n",
-                j, obj->cpuset);
-	    obj->memory.local_memory = cachesize[i];
-	    obj->memory.page_types_len = 2;
-	    obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
-	    memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
-	    obj->memory.page_types[0].size = hwloc_getpagesize();
-#ifdef HAVE__SC_LARGE_PAGESIZE
-	    obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
-#endif
-          }
-
-          hwloc_insert_object_by_cpuset(topology, obj);
-        }
-      }
-    }
-  out:
-    if (NULL != cacheconfig) {
-        free(cacheconfig);
-    }
-    if (NULL != cachesize) {
-        free(cachesize);
-    }
-    if (NULL != cacheconfig32) {
-        free(cacheconfig32);
-    }
-  }
-
-
-  /* add PU objects */
-  hwloc_setup_pu_level(topology, nprocs);
-
-  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Darwin");
-  if (topology->is_thissystem)
-    hwloc_add_uname_info(topology, NULL);
-  return 1;
-}
-
-void
-hwloc_set_darwin_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
-		       struct hwloc_topology_support *support __hwloc_attribute_unused)
-{
-}
-
-static struct hwloc_backend *
-hwloc_darwin_component_instantiate(struct hwloc_disc_component *component,
-				   const void *_data1 __hwloc_attribute_unused,
-				   const void *_data2 __hwloc_attribute_unused,
-				   const void *_data3 __hwloc_attribute_unused)
-{
-  struct hwloc_backend *backend;
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-  backend->discover = hwloc_look_darwin;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_darwin_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
-  "darwin",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  hwloc_darwin_component_instantiate,
-  50,
-  NULL
-};
-
-const struct hwloc_component hwloc_darwin_component = {
-  HWLOC_COMPONENT_ABI,
-  NULL, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_darwin_disc_component
-};
diff --git a/ext/hwloc/hwloc/topology-fake.c b/ext/hwloc/hwloc/topology-fake.c
index e3e22a0a1..ba50cbfcf 100644
--- a/ext/hwloc/hwloc/topology-fake.c
+++ b/ext/hwloc/hwloc/topology-fake.c
@@ -1,31 +1,70 @@
 /*
- * Copyright © 2012-2014 Inria.  All rights reserved.
+ * Copyright © 2012-2019 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
 
 #include <stdlib.h>
 
+static int
+hwloc_look_fake(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
+{ /* Discovery callback of the "fake" debug component: optionally shrinks the topology, then returns 0. */
+  hwloc_topology_t topology = backend->topology;
+
+  assert(dstatus->phase == HWLOC_DISC_PHASE_TWEAK); /* this callback only expects to run in the tweak phase */
+
+  if (getenv("HWLOC_DEBUG_FAKE_COMPONENT_TWEAK")) { /* nothing is modified unless this environment variable is set */
+    hwloc_obj_t obj;
+    int err;
+    /* restrict to single (last) PU: the PU at index (number of PUs - 1) */
+    obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU)-1);
+    assert(obj);
+    err = hwloc_topology_restrict(topology, obj->cpuset, 0);
+    assert(!err); /* NOTE(review): err is only read by assert(), so failures go unnoticed if asserts are compiled out */
+    /* restrict to single (first) NUMA node, this time restricting by nodeset instead of cpuset */
+    obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
+    assert(obj);
+    err = hwloc_topology_restrict(topology, obj->nodeset, HWLOC_RESTRICT_FLAG_BYNODESET);
+    assert(!err);
+  }
+
+  return 0; /* unconditionally 0, whether or not the tweak ran */
+}
+
 static struct hwloc_backend *
-hwloc_fake_component_instantiate(struct hwloc_disc_component *component __hwloc_attribute_unused,
+hwloc_fake_component_instantiate(struct hwloc_topology *topology __hwloc_attribute_unused, /* NOTE(review): marked unused but passed to hwloc_backend_alloc() below */
+				 struct hwloc_disc_component *component __hwloc_attribute_unused, /* NOTE(review): marked unused but passed to hwloc_backend_alloc() below */
+				 unsigned excluded_phases __hwloc_attribute_unused,
 				 const void *_data1 __hwloc_attribute_unused,
 				 const void *_data2 __hwloc_attribute_unused,
 				 const void *_data3 __hwloc_attribute_unused)
 {
+  struct hwloc_backend *backend; /* returned to the caller on success */
+
+  backend = hwloc_backend_alloc(topology, component);
+  if (!backend)
+    goto out; /* allocation failed: fall through to the NULL return, skipping the debug message */
+  backend->discover = hwloc_look_fake; /* hwloc_look_fake becomes this backend's discover callback */
+
   if (getenv("HWLOC_DEBUG_FAKE_COMPONENT"))
     printf("fake component instantiated\n");
+
+  return backend;
+
+ out: /* error path */
   return NULL;
 }
 
 static struct hwloc_disc_component hwloc_fake_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_MISC, /* so that it's always enabled when using the OS discovery */
   "fake",
+  HWLOC_DISC_PHASE_TWEAK, /* the same phase hwloc_look_fake() asserts it is invoked in */
   0, /* nothing to exclude */
   hwloc_fake_component_instantiate,
   100, /* make sure it's loaded before anything conflicting excludes it */
+  1, /* NOTE(review): presumably the "enabled by default" flag - confirm against struct hwloc_disc_component */
   NULL
 };
 
diff --git a/ext/hwloc/hwloc/topology-freebsd.cb b/ext/hwloc/hwloc/topology-freebsd.cb
deleted file mode 100644
index d8d4c54a9..000000000
--- a/ext/hwloc/hwloc/topology-freebsd.cb
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
- * Copyright © 2009-2010, 2012 Université Bordeaux
- * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
- * See COPYING in top-level directory.
- */
-
-#include <private/autogen/config.h>
-
-#include <sys/types.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <sys/param.h>
-#include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
-#include <pthread_np.h>
-#endif
-#ifdef HAVE_SYS_CPUSET_H
-#include <sys/cpuset.h>
-#endif
-#ifdef HAVE_SYS_SYSCTL_H
-#include <sys/sysctl.h>
-#endif
-
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-
-#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
-static void
-hwloc_freebsd_bsd2hwloc(hwloc_bitmap_t hwloc_cpuset, const cpuset_t *cset)
-{
-  unsigned cpu;
-  hwloc_bitmap_zero(hwloc_cpuset);
-  for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
-    if (CPU_ISSET(cpu, cset))
-      hwloc_bitmap_set(hwloc_cpuset, cpu);
-}
-
-static void
-hwloc_freebsd_hwloc2bsd(hwloc_const_bitmap_t hwloc_cpuset, cpuset_t *cset)
-{
-  unsigned cpu;
-  CPU_ZERO(cset);
-  for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
-    if (hwloc_bitmap_isset(hwloc_cpuset, cpu))
-      CPU_SET(cpu, cset);
-}
-
-static int
-hwloc_freebsd_set_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
-{
-  cpuset_t cset;
-
-  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &cset);
-
-  if (cpuset_setaffinity(level, which, id, sizeof(cset), &cset))
-    return -1;
-
-  return 0;
-}
-
-static int
-hwloc_freebsd_get_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
-{
-  cpuset_t cset;
-
-  if (cpuset_getaffinity(level, which, id, sizeof(cset), &cset))
-    return -1;
-
-  hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &cset);
-  return 0;
-}
-
-static int
-hwloc_freebsd_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
-}
-
-static int
-hwloc_freebsd_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
-}
-
-static int
-hwloc_freebsd_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
-}
-
-static int
-hwloc_freebsd_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
-}
-
-static int
-hwloc_freebsd_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
-}
-
-static int
-hwloc_freebsd_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t hwloc_cpuset, int flags)
-{
-  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
-}
-
-#ifdef hwloc_thread_t
-
-#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
-#pragma weak pthread_setaffinity_np
-static int
-hwloc_freebsd_set_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
-{
-  int err;
-  cpuset_t cset;
-
-  if (!pthread_setaffinity_np) {
-    errno = ENOSYS;
-    return -1;
-  }
-
-  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &cset);
-
-  err = pthread_setaffinity_np(tid, sizeof(cset), &cset);
-
-  if (err) {
-    errno = err;
-    return -1;
-  }
-
-  return 0;
-}
-#endif
-
-#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
-#pragma weak pthread_getaffinity_np
-static int
-hwloc_freebsd_get_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
-{
-  int err;
-  cpuset_t cset;
-
-  if (!pthread_getaffinity_np) {
-    errno = ENOSYS;
-    return -1;
-  }
-
-  err = pthread_getaffinity_np(tid, sizeof(cset), &cset);
-
-  if (err) {
-    errno = err;
-    return -1;
-  }
-
-  hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &cset);
-  return 0;
-}
-#endif
-#endif
-#endif
-
-#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
-static void
-hwloc_freebsd_node_meminfo_info(struct hwloc_topology *topology)
-{
-       int mib[2] = { CTL_HW, HW_PHYSMEM };
-       unsigned long physmem;
-       size_t len = sizeof(physmem);
-       sysctl(mib, 2, &physmem, &len, NULL, 0);
-       topology->levels[0][0]->memory.local_memory = physmem;
-       /* we don't know anything about NUMA nodes in this backend.
-        * let another backend or the core move that memory to the right NUMA node */
-}
-#endif
-
-static int
-hwloc_look_freebsd(struct hwloc_backend *backend)
-{
-  struct hwloc_topology *topology = backend->topology;
-  unsigned nbprocs = hwloc_fallback_nbprocessors(topology);
-
-  if (!topology->levels[0][0]->cpuset) {
-    /* Nobody (even the x86 backend) created objects yet, setup basic objects */
-    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-    hwloc_setup_pu_level(topology, nbprocs);
-  }
-
-  /* Add FreeBSD specific information */
-#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
-  hwloc_freebsd_node_meminfo_info(topology);
-#endif
-  hwloc_obj_add_info(topology->levels[0][0], "Backend", "FreeBSD");
-  if (topology->is_thissystem)
-    hwloc_add_uname_info(topology, NULL);
-  return 1;
-}
-
-void
-hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
-			struct hwloc_topology_support *support __hwloc_attribute_unused)
-{
-#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
-  hooks->set_thisproc_cpubind = hwloc_freebsd_set_thisproc_cpubind;
-  hooks->get_thisproc_cpubind = hwloc_freebsd_get_thisproc_cpubind;
-  hooks->set_thisthread_cpubind = hwloc_freebsd_set_thisthread_cpubind;
-  hooks->get_thisthread_cpubind = hwloc_freebsd_get_thisthread_cpubind;
-  hooks->set_proc_cpubind = hwloc_freebsd_set_proc_cpubind;
-  hooks->get_proc_cpubind = hwloc_freebsd_get_proc_cpubind;
-#ifdef hwloc_thread_t
-#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
-  hooks->set_thread_cpubind = hwloc_freebsd_set_thread_cpubind;
-#endif
-#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
-  hooks->get_thread_cpubind = hwloc_freebsd_get_thread_cpubind;
-#endif
-#endif
-#endif
-  /* TODO: get_last_cpu_location: find out ki_lastcpu */
-}
-
-static struct hwloc_backend *
-hwloc_freebsd_component_instantiate(struct hwloc_disc_component *component,
-				    const void *_data1 __hwloc_attribute_unused,
-				    const void *_data2 __hwloc_attribute_unused,
-				    const void *_data3 __hwloc_attribute_unused)
-{
-  struct hwloc_backend *backend;
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-  backend->discover = hwloc_look_freebsd;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_freebsd_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
-  "freebsd",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  hwloc_freebsd_component_instantiate,
-  50,
-  NULL
-};
-
-const struct hwloc_component hwloc_freebsd_component = {
-  HWLOC_COMPONENT_ABI,
-  NULL, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_freebsd_disc_component
-};
diff --git a/ext/hwloc/hwloc/topology-hardwired.c b/ext/hwloc/hwloc/topology-hardwired.c
new file mode 100644
index 000000000..ee06781d0
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-hardwired.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright © 2015-2017 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+
+int hwloc_look_hardwired_fujitsu_k(struct hwloc_topology *topology)
+{
+  /* Hardwired Fujitsu SPARC64 VIIIfx node: 8 cores, each with its own L1i and L1d, plus one L2 and one package; every object type is only inserted if the topology's type filter keeps it.
+   * If a broken core gets disabled, its bit disappears and other core bits are NOT shifted towards 0. Such a node is not given to user jobs, so there is no need to handle that case properly.
+   */
+  unsigned i;
+  hwloc_obj_t obj;
+  hwloc_bitmap_t set;
+
+  for(i=0; i<8; i++) {
+    set = hwloc_bitmap_alloc(); /* cpuset holding only bit i; the caches below get duplicates of it */
+    hwloc_bitmap_set(set, i);
+
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1ICACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1ICACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 32*1024;
+      obj->attr->cache.linesize = 128;
+      obj->attr->cache.associativity = 2;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1CACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1CACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 32*1024;
+      obj->attr->cache.linesize = 128;
+      obj->attr->cache.associativity = 2;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, i);
+      obj->cpuset = set; /* the core object takes set itself rather than a duplicate */
+      hwloc_insert_object_by_cpuset(topology, obj);
+    } else
+      hwloc_bitmap_free(set); /* no core object took set, so release it here */
+  }
+
+  set = hwloc_bitmap_alloc(); /* cpuset used for the L2 cache (duplicated) and the package (taken as is) */
+  hwloc_bitmap_set_range(set, 0, 7);
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L2CACHE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L2CACHE, HWLOC_UNKNOWN_INDEX);
+    obj->cpuset = hwloc_bitmap_dup(set);
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.size = 6*1024*1024;
+    obj->attr->cache.linesize = 128;
+    obj->attr->cache.associativity = 12;
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, 0);
+    obj->cpuset = set; /* the package object takes set itself */
+    hwloc_obj_add_info(obj, "CPUVendor", "Fujitsu");
+    hwloc_obj_add_info(obj, "CPUModel", "SPARC64 VIIIfx");
+    hwloc_insert_object_by_cpuset(topology, obj);
+  } else
+    hwloc_bitmap_free(set); /* no package object took set, so release it here */
+
+  topology->support.discovery->pu = 1;
+  hwloc_setup_pu_level(topology, 8);
+
+  return 0; /* unconditionally 0 */
+}
+
+int hwloc_look_hardwired_fujitsu_fx10(struct hwloc_topology *topology)
+{
+  /* Hardwired Fujitsu SPARC64 IXfx node: 16 cores, each with its own L1i and L1d, plus one L2 and one package; every object type is only inserted if the topology's type filter keeps it.
+   * If a broken core gets disabled, its bit disappears and other core bits are NOT shifted towards 0. Such a node is not given to user jobs, so there is no need to handle that case properly.
+   */
+  unsigned i;
+  hwloc_obj_t obj;
+  hwloc_bitmap_t set;
+
+  for(i=0; i<16; i++) {
+    set = hwloc_bitmap_alloc(); /* cpuset holding only bit i; the caches below get duplicates of it */
+    hwloc_bitmap_set(set, i);
+
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1ICACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1ICACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 32*1024;
+      obj->attr->cache.linesize = 128;
+      obj->attr->cache.associativity = 2;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1CACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1CACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 32*1024;
+      obj->attr->cache.linesize = 128;
+      obj->attr->cache.associativity = 2;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, i);
+      obj->cpuset = set; /* the core object takes set itself rather than a duplicate */
+      hwloc_insert_object_by_cpuset(topology, obj);
+    } else
+      hwloc_bitmap_free(set); /* no core object took set, so release it here */
+  }
+
+  set = hwloc_bitmap_alloc(); /* cpuset used for the L2 cache (duplicated) and the package (taken as is) */
+  hwloc_bitmap_set_range(set, 0, 15);
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L2CACHE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L2CACHE, HWLOC_UNKNOWN_INDEX);
+    obj->cpuset = hwloc_bitmap_dup(set);
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.size = 12*1024*1024;
+    obj->attr->cache.linesize = 128;
+    obj->attr->cache.associativity = 24;
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, 0);
+    obj->cpuset = set; /* the package object takes set itself */
+    hwloc_obj_add_info(obj, "CPUVendor", "Fujitsu");
+    hwloc_obj_add_info(obj, "CPUModel", "SPARC64 IXfx");
+    hwloc_insert_object_by_cpuset(topology, obj);
+  } else
+    hwloc_bitmap_free(set); /* no package object took set, so release it here */
+
+  topology->support.discovery->pu = 1;
+  hwloc_setup_pu_level(topology, 16);
+
+  return 0; /* unconditionally 0 */
+}
+
+int hwloc_look_hardwired_fujitsu_fx100(struct hwloc_topology *topology)
+{
+  /* Hardwired Fujitsu SPARC64 XIfx node: 34 cores, each with its own L1i and L1d, plus two L2 caches and one package; every object type is only inserted if the topology's type filter keeps it.
+   * If a broken core gets disabled, its bit disappears and other core bits are NOT shifted towards 0. Such a node is not given to user jobs, so there is no need to handle that case properly.
+   */
+  unsigned i;
+  hwloc_obj_t obj;
+  hwloc_bitmap_t set;
+
+  for(i=0; i<34; i++) {
+    set = hwloc_bitmap_alloc(); /* cpuset holding only bit i; the caches below get duplicates of it */
+    hwloc_bitmap_set(set, i);
+
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1ICACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1ICACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 64*1024;
+      obj->attr->cache.linesize = 256;
+      obj->attr->cache.associativity = 4;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L1CACHE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L1CACHE, HWLOC_UNKNOWN_INDEX);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 64*1024;
+      obj->attr->cache.linesize = 256;
+      obj->attr->cache.associativity = 4;
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, i);
+      obj->cpuset = set; /* the core object takes set itself rather than a duplicate */
+      hwloc_insert_object_by_cpuset(topology, obj);
+    } else
+      hwloc_bitmap_free(set); /* no core object took set, so release it here */
+  }
+
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L2CACHE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L2CACHE, HWLOC_UNKNOWN_INDEX);
+    obj->cpuset = hwloc_bitmap_alloc(); /* first L2: range 0..15 plus bit 32, allocated directly on the object */
+    hwloc_bitmap_set_range(obj->cpuset, 0, 15);
+    hwloc_bitmap_set(obj->cpuset, 32);
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.size = 12*1024*1024;
+    obj->attr->cache.linesize = 256;
+    obj->attr->cache.associativity = 24;
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L2CACHE, HWLOC_UNKNOWN_INDEX);
+    obj->cpuset = hwloc_bitmap_alloc(); /* second L2: range 16..31 plus bit 33 */
+    hwloc_bitmap_set_range(obj->cpuset, 16, 31);
+    hwloc_bitmap_set(obj->cpuset, 33);
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.size = 12*1024*1024;
+    obj->attr->cache.linesize = 256;
+    obj->attr->cache.associativity = 24;
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, 0);
+    obj->cpuset = hwloc_bitmap_alloc(); /* allocated on the object, so unlike the k/fx10 variants there is nothing to free when the package is filtered out */
+    hwloc_bitmap_set_range(obj->cpuset, 0, 33);
+    hwloc_obj_add_info(obj, "CPUVendor", "Fujitsu");
+    hwloc_obj_add_info(obj, "CPUModel", "SPARC64 XIfx");
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+
+  topology->support.discovery->pu = 1;
+  hwloc_setup_pu_level(topology, 34);
+
+  return 0; /* unconditionally 0 */
+}
diff --git a/ext/hwloc/hwloc/topology-linux.c b/ext/hwloc/hwloc/topology-linux.c
index 7b3f8e113..b2195f685 100644
--- a/ext/hwloc/hwloc/topology-linux.c
+++ b/ext/hwloc/hwloc/topology-linux.c
@@ -1,20 +1,20 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
- * Copyright © 2009-2013 Université Bordeaux
- * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2013, 2015 Université Bordeaux
+ * Copyright © 2009-2018 Cisco Systems, Inc.  All rights reserved.
  * Copyright © 2015 Intel, Inc.  All rights reserved.
  * Copyright © 2010 IBM
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <hwloc/linux.h>
-#include <private/misc.h>
-#include <private/private.h>
-#include <private/misc.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "hwloc/linux.h"
+#include "private/misc.h"
+#include "private/private.h"
+#include "private/misc.h"
+#include "private/debug.h"
 
 #include <limits.h>
 #include <stdio.h>
@@ -27,7 +27,7 @@
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
-#ifdef HAVE_LIBUDEV_H
+#ifdef HWLOC_HAVE_LIBUDEV
 #include <libudev.h>
 #endif
 #include <sys/types.h>
@@ -36,31 +36,33 @@
 #include <pthread.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
-#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
-#define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
-#include <numaif.h>
-#endif
+#include <mntent.h>
 
 struct hwloc_linux_backend_data_s {
+  char *root_path; /* NULL if unused */
   int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
   int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
-#ifdef HAVE_LIBUDEV_H
+#ifdef HWLOC_HAVE_LIBUDEV
   struct udev *udev; /* Global udev context */
 #endif
-  int is_amd_with_CU;
-  int is_knl;
+  char *dumped_hwdata_dirname;
   enum {
     HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
     HWLOC_LINUX_ARCH_IA64,
     HWLOC_LINUX_ARCH_ARM,
     HWLOC_LINUX_ARCH_POWER,
+    HWLOC_LINUX_ARCH_S390,
     HWLOC_LINUX_ARCH_UNKNOWN
-} arch;
+  } arch;
+  int is_knl;
+  int is_amd_with_CU;
+  int use_dt;
+  int use_numa_distances;
+  int use_numa_distances_for_cpuless;
+  int use_numa_initiators;
   struct utsname utsname; /* fields contain \0 when unknown */
-
-  int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
-  int mic_need_directlookup; /* if not tried yet, 0 if not needed, 1 if needed */
-  unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
+  int fallback_nbprocessors; /* only used in hwloc_linux_fallback_pu_level(), may be <= 0 (error) earlier */
+  unsigned pagesize;
 };
 
 
@@ -69,10 +71,9 @@ struct hwloc_linux_backend_data_s {
  * Misc Abstraction layers *
  ***************************/
 
-#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE__SYSCALL3)
-/* libc doesn't have support for sched_setaffinity, build system call
+#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
+/* libc doesn't have support for sched_setaffinity, make system call
  * ourselves: */
-#    include <linux/unistd.h>
 #    ifndef __NR_sched_setaffinity
 #       ifdef __i386__
 #         define __NR_sched_setaffinity 241
@@ -92,6 +93,8 @@ struct hwloc_linux_backend_data_s {
 #         define __NR_sched_setaffinity 311
 #       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 #         define __NR_sched_setaffinity 222
+#       elif defined(__aarch64__)
+#         define __NR_sched_setaffinity 122
 #       elif defined(__arm__)
 #         define __NR_sched_setaffinity 241
 #       elif defined(__cris__)
@@ -104,7 +107,7 @@ struct hwloc_linux_backend_data_s {
 #       endif
 #    endif
 #    ifndef sched_setaffinity
-       _syscall3(int, sched_setaffinity, pid_t, pid, unsigned int, lg, const void *, mask)
+#      define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
 #    endif
 #    ifndef __NR_sched_getaffinity
 #       ifdef __i386__
@@ -125,6 +128,8 @@ struct hwloc_linux_backend_data_s {
 #         define __NR_sched_getaffinity 312
 #       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
 #         define __NR_sched_getaffinity 223
+#       elif defined(__aarch64__)
+#         define __NR_sched_getaffinity 123
 #       elif defined(__arm__)
 #         define __NR_sched_getaffinity 242
 #       elif defined(__cris__)
@@ -137,10 +142,201 @@ struct hwloc_linux_backend_data_s {
 #       endif
 #    endif
 #    ifndef sched_getaffinity
-       _syscall3(int, sched_getaffinity, pid_t, pid, unsigned int, lg, void *, mask)
+#      define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
 #    endif
 #endif
 
+/* numa syscalls are only in libnuma, but libnuma devel headers aren't widely installed.
+ * just redefine these syscalls to avoid requiring libnuma devel headers just because of these missing syscalls.
+ * __NR_foo should be defined in headers in all modern platforms.
+ * Just redefine the basic ones on important platforms when not too hard to detect/define.
+ */
+
+#ifndef MPOL_DEFAULT
+# define MPOL_DEFAULT 0
+#endif
+#ifndef MPOL_PREFERRED
+# define MPOL_PREFERRED 1
+#endif
+#ifndef MPOL_BIND
+# define MPOL_BIND 2
+#endif
+#ifndef MPOL_INTERLEAVE
+# define MPOL_INTERLEAVE 3
+#endif
+#ifndef MPOL_LOCAL
+# define MPOL_LOCAL 4
+#endif
+#ifndef MPOL_F_ADDR
+# define  MPOL_F_ADDR (1<<1)
+#endif
+#ifndef MPOL_MF_STRICT
+# define MPOL_MF_STRICT (1<<0)
+#endif
+#ifndef MPOL_MF_MOVE
+# define MPOL_MF_MOVE (1<<1)
+#endif
+
+#ifndef __NR_mbind /* fallback syscall numbers, only used when the system headers do not define __NR_mbind */
+# ifdef __i386__
+#  define __NR_mbind 274
+# elif defined(__x86_64__)
+#  define __NR_mbind 237
+# elif defined(__ia64__)
+#  define __NR_mbind 1259
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#  define __NR_mbind 259
+# elif defined(__sparc__)
+#  define __NR_mbind 303 /* sparc numbers run 302 migrate_pages, 303 mbind, 304 get_mempolicy, 305 set_mempolicy */
+# elif defined(__aarch64__)
+#  define __NR_mbind 235
+# elif defined(__arm__)
+#  define __NR_mbind 319
+# endif
+#endif /* __NR_mbind */
+static __hwloc_inline long hwloc_mbind(void *addr __hwloc_attribute_unused,
+				       unsigned long len __hwloc_attribute_unused,
+				       int mode __hwloc_attribute_unused,
+				       const unsigned long *nodemask __hwloc_attribute_unused,
+				       unsigned long maxnode __hwloc_attribute_unused,
+				       unsigned flags __hwloc_attribute_unused)
+{ /* Raw mbind wrapper: forwards its arguments, in order, to syscall(__NR_mbind, ...) and returns that result. */
+#if (defined __NR_mbind) && (defined HWLOC_HAVE_SYSCALL)
+  return syscall(__NR_mbind, (long) addr, len, mode, (long)nodemask, maxnode, flags);
+#else /* no syscall number or no syscall(): compile a stub that fails with ENOSYS (arguments are then unused) */
+#warning Couldn't find __NR_mbind syscall number, memory binding won't be supported
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
+#ifndef __NR_set_mempolicy /* fallback syscall numbers, only used when the system headers do not define __NR_set_mempolicy */
+# ifdef __i386__
+#  define __NR_set_mempolicy 276
+# elif defined(__x86_64__)
+#  define __NR_set_mempolicy 238 /* x86_64 order is 237 mbind, 238 set_mempolicy, 239 get_mempolicy (i386 has get before set) */
+# elif defined(__ia64__)
+#  define __NR_set_mempolicy 1261
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#  define __NR_set_mempolicy 261
+# elif defined(__sparc__)
+#  define __NR_set_mempolicy 305
+# elif defined(__aarch64__)
+#  define __NR_set_mempolicy 237
+# elif defined(__arm__)
+#  define __NR_set_mempolicy 321
+# endif
+#endif /* __NR_set_mempolicy */
+static __hwloc_inline long hwloc_set_mempolicy(int mode __hwloc_attribute_unused,
+					       const unsigned long *nodemask __hwloc_attribute_unused,
+					       unsigned long maxnode __hwloc_attribute_unused)
+{ /* Raw set_mempolicy wrapper: forwards its arguments, in order, to syscall(__NR_set_mempolicy, ...) and returns that result. */
+#if (defined __NR_set_mempolicy) && (defined HWLOC_HAVE_SYSCALL)
+  return syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
+#else /* no syscall number or no syscall(): compile a stub that fails with ENOSYS (arguments are then unused) */
+#warning Couldn't find __NR_set_mempolicy syscall number, memory binding won't be supported
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
+#ifndef __NR_get_mempolicy /* fallback syscall numbers, only used when the system headers do not define __NR_get_mempolicy */
+# ifdef __i386__
+#  define __NR_get_mempolicy 275
+# elif defined(__x86_64__)
+#  define __NR_get_mempolicy 239 /* x86_64 order is 237 mbind, 238 set_mempolicy, 239 get_mempolicy (i386 has get before set) */
+# elif defined(__ia64__)
+#  define __NR_get_mempolicy 1260
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#  define __NR_get_mempolicy 260
+# elif defined(__sparc__)
+#  define __NR_get_mempolicy 304
+# elif defined(__aarch64__)
+#  define __NR_get_mempolicy 236
+# elif defined(__arm__)
+#  define __NR_get_mempolicy 320
+# endif
+#endif /* __NR_get_mempolicy */
+static __hwloc_inline long hwloc_get_mempolicy(int *mode __hwloc_attribute_unused,
+					       const unsigned long *nodemask __hwloc_attribute_unused, /* NOTE(review): declared const here although the kernel presumably writes the node mask through it - confirm */
+					       unsigned long maxnode __hwloc_attribute_unused,
+					       void *addr __hwloc_attribute_unused,
+					       int flags __hwloc_attribute_unused)
+{ /* Raw get_mempolicy wrapper: forwards its arguments, in order, to syscall(__NR_get_mempolicy, ...) and returns that result. */
+#if (defined __NR_get_mempolicy) && (defined HWLOC_HAVE_SYSCALL)
+  return syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
+#else /* no syscall number or no syscall(): compile a stub that fails with ENOSYS (arguments are then unused) */
+#warning Couldn't find __NR_get_mempolicy syscall number, memory binding won't be supported
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
+#ifndef __NR_migrate_pages /* fallback syscall numbers, only used when the system headers do not define __NR_migrate_pages */
+# ifdef __i386__
+#  define __NR_migrate_pages 294 /* 204 is setregid32 on i386; migrate_pages is 294 */
+# elif defined(__x86_64__)
+#  define __NR_migrate_pages 256
+# elif defined(__ia64__)
+#  define __NR_migrate_pages 1280
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#  define __NR_migrate_pages 258
+# elif defined(__sparc__)
+#  define __NR_migrate_pages 302
+# elif defined(__aarch64__)
+#  define __NR_migrate_pages 238
+# elif defined(__arm__)
+#  define __NR_migrate_pages 400
+# endif
+#endif /* __NR_migrate_pages */
+static __hwloc_inline long hwloc_migrate_pages(int pid __hwloc_attribute_unused,
+					       unsigned long maxnode __hwloc_attribute_unused,
+					       const unsigned long *oldnodes __hwloc_attribute_unused,
+					       const unsigned long *newnodes __hwloc_attribute_unused)
+{ /* Raw migrate_pages wrapper: forwards its arguments, in order, to syscall(__NR_migrate_pages, ...) and returns that result. */
+#if (defined __NR_migrate_pages) && (defined HWLOC_HAVE_SYSCALL)
+  return syscall(__NR_migrate_pages, pid, maxnode, oldnodes, newnodes);
+#else /* no syscall number or no syscall(): compile a stub that fails with ENOSYS (arguments are then unused) */
+#warning Couldn't find __NR_migrate_pages syscall number, memory migration won't be supported
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
+#ifndef __NR_move_pages /* fallback syscall numbers, only used when the system headers do not define __NR_move_pages */
+# ifdef __i386__
+#  define __NR_move_pages 317
+# elif defined(__x86_64__)
+#  define __NR_move_pages 279
+# elif defined(__ia64__)
+#  define __NR_move_pages 1276
+# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#  define __NR_move_pages 301
+# elif defined(__sparc__)
+#  define __NR_move_pages 307
+# elif defined(__aarch64__)
+#  define __NR_move_pages 239
+# elif defined(__arm__)
+#  define __NR_move_pages 344
+# endif
+#endif /* __NR_move_pages */
+static __hwloc_inline long hwloc_move_pages(int pid __hwloc_attribute_unused,
+					    unsigned long count __hwloc_attribute_unused,
+					    void **pages __hwloc_attribute_unused,
+					    const int *nodes __hwloc_attribute_unused,
+					    int *status __hwloc_attribute_unused,
+					    int flags __hwloc_attribute_unused)
+{ /* Raw move_pages wrapper: forwards its arguments, in order, to syscall(__NR_move_pages, ...) and returns that result. */
+#if (defined __NR_move_pages) && (defined HWLOC_HAVE_SYSCALL)
+  return syscall(__NR_move_pages, pid, count, pages, nodes, status, flags);
+#else /* no syscall number or no syscall(): compile a stub that fails with ENOSYS (arguments are then unused) */
+#warning Couldn't find __NR_move_pages syscall number, getting memory location won't be supported
+  errno = ENOSYS;
+  return -1;
+#endif
+}
+
+
 /* Added for ntohl() */
 #include <arpa/inet.h>
 
@@ -150,14 +346,11 @@ struct hwloc_linux_backend_data_s {
 static const char *
 hwloc_checkat(const char *path, int fsroot_fd)
 {
-  const char *relative_path;
-  if (fsroot_fd < 0) {
-    errno = EBADF;
-    return NULL;
-  }
+  const char *relative_path = path; /* returned unchanged when fsroot_fd is negative */
 
-  /* Skip leading slashes.  */
-  for (relative_path = path; *relative_path == '/'; relative_path++);
+  if (fsroot_fd >= 0)
+    /* Skip leading slashes so that the path can be resolved relative to fsroot_fd.  */
+    for (; *relative_path == '/'; relative_path++);
 
   return relative_path;
 }
@@ -232,6 +425,18 @@ hwloc_opendirat(const char *path, int fsroot_fd)
   return fdopendir(dir_fd);
 }
 
+static int
+hwloc_readlinkat(const char *path, char *buf, size_t buflen, int fsroot_fd)
+{
+  const char *relative_path;
+
+  relative_path = hwloc_checkat(path, fsroot_fd); /* strips leading slashes when fsroot_fd >= 0 */
+  if (!relative_path)
+    return -1;
+
+  return readlinkat(fsroot_fd, relative_path, buf, buflen); /* as with readlink(), buf is not \0-terminated */
+}
+
 #endif /* HAVE_OPENAT */
 
 /* Static inline version of fopen so that we can use openat if we have
@@ -300,180 +505,493 @@ hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
 #endif
 }
 
+static __hwloc_inline int
+hwloc_readlink(const char *p, char *l, size_t ll, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+  return hwloc_readlinkat(p, l, ll, d); /* resolve p through the fsroot directory fd d */
+#else /* no openat support: d is ignored and p is passed to readlink() as is */
+  return readlink(p, l, ll);
+#endif /* HAVE_OPENAT */
+}
 
-/*****************************
- ******* CpuBind Hooks *******
- *****************************/
 
-int
-hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
-{
-  /* The resulting binding is always strict */
+/*****************************************
+ ******* Helpers for reading files *******
+ *****************************************/
 
-#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
-  cpu_set_t *plinux_set;
-  unsigned cpu;
-  int last;
-  size_t setsize;
-  int err;
+static __hwloc_inline int
+hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
+{
+  int fd, ret;
 
-  last = hwloc_bitmap_last(hwloc_set);
-  if (last == -1) {
-    errno = EINVAL;
+  fd = hwloc_open(path, fsroot_fd);
+  if (fd < 0)
     return -1;
-  }
 
-  setsize = CPU_ALLOC_SIZE(last+1);
-  plinux_set = CPU_ALLOC(last+1);
+  ret = read(fd, string, length-1); /* single read of at most length-1 bytes, keeping room for the ending \0 */
+  close(fd);
 
-  CPU_ZERO_S(setsize, plinux_set);
-  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
-    CPU_SET_S(cpu, setsize, plinux_set);
-  hwloc_bitmap_foreach_end();
+  if (ret <= 0) /* read error or empty file: both are reported as failure */
+    return -1;
 
-  err = sched_setaffinity(tid, setsize, plinux_set);
+  string[ret] = 0; /* terminate right after the bytes actually read */
 
-  CPU_FREE(plinux_set);
-  return err;
-#elif defined(HWLOC_HAVE_CPU_SET)
-  cpu_set_t linux_set;
-  unsigned cpu;
+  return 0;
+}
 
-  CPU_ZERO(&linux_set);
-  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
-    CPU_SET(cpu, &linux_set);
-  hwloc_bitmap_foreach_end();
+static __hwloc_inline int /* 0 on success, -1 if the file cannot be read (*value is then left untouched) */
+hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
+{
+  char string[12]; /* sign + 10 digits + ending \0: enough for any 32-bit int, INT_MIN included */
+  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
+    return -1;
+  *value = atoi(string); /* decimal text; a trailing newline is ignored by atoi() */
+  return 0;
+}
 
-#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
-  return sched_setaffinity(tid, &linux_set);
-#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
-  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
-#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
-#elif defined(HWLOC_HAVE__SYSCALL3)
-  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+static __hwloc_inline int
+hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
+{
+  char string[11]; /* room for 10 characters + ending \0 */
+  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
+    return -1;
+  *value = (unsigned) strtoul(string, NULL, 10); /* parsed as base 10, no error checking */
+  return 0;
+}
 
-#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
-  return sched_setaffinity(tid, (void*) &mask);
-#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
-  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
-#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
-#else /* !_SYSCALL3 */
-  errno = ENOSYS;
-  return -1;
-#endif /* !_SYSCALL3 */
+static __hwloc_inline int
+hwloc_read_path_as_uint64(const char *path, uint64_t *value, int fsroot_fd)
+{
+  char string[22]; /* room for 21 characters + ending \0 */
+  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
+    return -1;
+  *value = (uint64_t) strtoull(string, NULL, 10); /* parsed as base 10, no error checking */
+  return 0;
 }
 
-#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
-static int
-hwloc_linux_parse_cpuset_file(FILE *file, hwloc_bitmap_t set)
+/* Read everything from fd and save it into a newly allocated, \0-terminated buffer
+ * returned in bufferp (to be freed by the caller). Use sizep as a default buffer size,
+ * and return the final buffer size in sizep. Returns 0 on success, -1 on failure.
+ */
+static __hwloc_inline int
+hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
 {
-  unsigned long start, stop;
+  char *buffer;
+  size_t toread, filesize, totalread;
+  ssize_t ret;
 
-  /* reset to zero first */
-  hwloc_bitmap_zero(set);
+  toread = filesize = *sizep;
 
-  while (fscanf(file, "%lu", &start) == 1)
-  {
-    int c = fgetc(file);
+  /* Alloc and read +1 so that we get EOF on 2^n without reading once more */
+  buffer = malloc(filesize+1);
+  if (!buffer)
+    return -1;
 
-    stop = start;
+  ret = read(fd, buffer, toread+1);
+  if (ret < 0) {
+    free(buffer);
+    return -1;
+  }
 
-    if (c == '-') {
-      /* Range */
-      if (fscanf(file, "%lu", &stop) != 1) {
-        /* Expected a number here */
-        errno = EINVAL;
-        return -1;
-      }
-      c = fgetc(file);
-    }
+  totalread = (size_t) ret;
 
-    if (c == EOF || c == '\n') {
-      hwloc_bitmap_set_range(set, start, stop);
-      break;
+  if (totalread < toread + 1)
+    /* Normal case, a single read got EOF */
+    goto done;
+
+  /* Unexpected case, must extend the buffer and read again.
+   * Only occurs on first invocation and if the kernel ever uses multiple pages for a single mask.
+   */
+  do {
+    char *tmp;
+
+    toread = filesize; /* the old size is also the amount of room added by doubling */
+    filesize *= 2;
+
+    tmp = realloc(buffer, filesize+1);
+    if (!tmp) {
+      free(buffer);
+      return -1;
     }
+    buffer = tmp;
 
-    if (c != ',') {
-      /* Expected EOF, EOL, or a comma */
-      errno = EINVAL;
+    ret = read(fd, buffer+toread+1, toread); /* append right after the toread+1 bytes already read */
+    if (ret < 0) {
+      free(buffer);
       return -1;
     }
 
-    hwloc_bitmap_set_range(set, start, stop);
-  }
+    totalread += ret;
+  } while ((size_t) ret == toread); /* a full read means there may be more data */
 
+ done:
+  buffer[totalread] = '\0';
+  *bufferp = buffer;
+  *sizep = filesize;
   return 0;
 }
 
-/*
- * On some kernels, sched_getaffinity requires the output size to be larger
- * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
- * Try sched_affinity on ourself until we find a nr_cpus value that makes
- * the kernel happy.
- */
-static int
-hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
-{
-  static int _nr_cpus = -1;
-  int nr_cpus = _nr_cpus;
-  FILE *possible;
+/* kernel cpumaps are composed of an array of 32bits cpumasks */
+#define KERNEL_CPU_MASK_BITS 32
+#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
 
-  if (nr_cpus != -1)
-    /* already computed */
-    return nr_cpus;
+static __hwloc_inline int
+hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
+{
+  static size_t _filesize = 0; /* will be dynamically initialized to hwloc_getpagesize(), and increased later if needed */
+  size_t filesize;
+  unsigned long *maps;
+  unsigned long map;
+  int nr_maps = 0;
+  static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
+				      * Actually, it may increase multiple times if first read cpumaps start with zeroes.
+				      */
+  int nr_maps_allocated = _nr_maps_allocated;
+  char *buffer, *tmpbuf;
+  int i;
 
-  if (topology->levels[0][0]->complete_cpuset)
-    /* start with a nr_cpus that may contain the whole topology */
-    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
-  if (nr_cpus <= 0)
-    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
-    nr_cpus = 1;
+  /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
+   * masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
+   *
+   * If we ever need a larger buffer, we'll realloc() the buffer during the first
+   * invocation of this function so that others directly allocate the right size
+   * (all cpumask files have the exact same size).
+   */
+  filesize = _filesize;
+  if (!filesize)
+    filesize = hwloc_getpagesize();
+  if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
+    return -1;
+  /* Only update the static value with the final one,
+   * to avoid sharing intermediate values that we modify,
+   * in case there's ever multiple concurrent calls.
+   */
+  _filesize = filesize;
 
-  possible = fopen("/sys/devices/system/cpu/possible", "r");
-  if (possible) {
-    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc();
-    if (hwloc_linux_parse_cpuset_file(possible, possible_bitmap) == 0) {
-      int max_possible = hwloc_bitmap_last(possible_bitmap);
+  maps = malloc(nr_maps_allocated * sizeof(*maps));
+  if (!maps) {
+    free(buffer);
+    return -1;
+  }
 
-      hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
+  /* reset to zero first */
+  hwloc_bitmap_zero(set);
 
-      if (nr_cpus < max_possible + 1)
-        nr_cpus = max_possible + 1;
+  /* parse the whole mask */
+  tmpbuf = buffer;
+  while (sscanf(tmpbuf, "%lx", &map) == 1) {
+    /* read one kernel cpu mask and the ending comma */
+    if (nr_maps == nr_maps_allocated) {
+      unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
+      if (!tmp) {
+	free(buffer);
+	free(maps);
+	return -1;
+      }
+      maps = tmp;
+      nr_maps_allocated *= 2;
     }
-    fclose(possible);
-    hwloc_bitmap_free(possible_bitmap);
+
+    tmpbuf = strchr(tmpbuf, ',');
+    if (!tmpbuf) {
+      maps[nr_maps++] = map; /* last mask of the file, always stored even if zero */
+      break;
+    } else
+      tmpbuf++;
+
+    if (!map && !nr_maps)
+      /* ignore the first map if it's empty */
+      continue;
+
+    maps[nr_maps++] = map;
   }
 
-  while (1) {
-    cpu_set_t *set = CPU_ALLOC(nr_cpus);
-    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
-    int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
-    CPU_FREE(set);
-    nr_cpus = setsize * 8; /* that's the value that was actually tested */
-    if (!err)
-      /* found it */
-      return _nr_cpus = nr_cpus;
-    nr_cpus *= 2;
+  free(buffer);
+
+  /* convert into a set */
+#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
+  for(i=0; i<nr_maps; i++)
+    hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]); /* the last mask parsed becomes ulong 0 */
+#else
+  for(i=0; i<(nr_maps+1)/2; i++) {
+    unsigned long mask;
+    mask = maps[nr_maps-2*i-1];
+    if (2*i+1<nr_maps)
+      mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS; /* pair two kernel masks into one ulong */
+    hwloc_bitmap_set_ith_ulong(set, i, mask);
   }
-}
 #endif
 
-int
-hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
-{
-  int err __hwloc_attribute_unused;
-
-#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
-  cpu_set_t *plinux_set;
-  unsigned cpu;
-  int last;
-  size_t setsize;
-  int kernel_nr_cpus;
+  free(maps);
 
-  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
-  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
-  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
-  plinux_set = CPU_ALLOC(kernel_nr_cpus);
+  /* Only update the static value with the final one,
+   * to avoid sharing intermediate values that we modify,
+   * in case there's ever multiple concurrent calls.
+   */
+  if (nr_maps_allocated > _nr_maps_allocated)
+    _nr_maps_allocated = nr_maps_allocated;
+  return 0;
+}
+
+static __hwloc_inline int
+hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
+{
+  int fd, err;
+  fd = hwloc_open(maskpath, fsroot_fd); /* maskpath is opened through fsroot_fd */
+  if (fd < 0)
+    return -1;
+  err = hwloc__read_fd_as_cpumask(fd, set);
+  close(fd); /* closed whether parsing succeeded or not */
+  return err;
+}
+
+static __hwloc_inline hwloc_bitmap_t
+hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
+{
+  hwloc_bitmap_t set;
+  int err;
+  set = hwloc_bitmap_alloc();
+  if (!set)
+    return NULL;
+  err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
+  if (err < 0) {
+    hwloc_bitmap_free(set); /* don't leak the bitmap when the file cannot be read */
+    return NULL;
+  } else
+    return set; /* ownership goes to the caller */
+}
+
+int
+hwloc_linux_read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set)
+{
+  int fd, err;
+  fd = open(maskpath, O_RDONLY); /* plain open(): no fsroot_fd is involved in this variant */
+  if (fd < 0)
+    return -1;
+  err = hwloc__read_fd_as_cpumask(fd, set);
+  close(fd); /* closed whether parsing succeeded or not */
+  return err;
+}
+
+/* Parse a comma-separated list of indexes and first-last ranges from fd into set. On failure, the content of set is undefined */
+static __hwloc_inline int
+hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
+{
+  /* Kernel sysfs files are usually at most one page.
+   * But cpulists can be of very different sizes depending on the fragmentation,
+   * so don't bother remembering the actual read size between invocations.
+   * We don't have many invocations anyway.
+   */
+  size_t filesize = hwloc_getpagesize();
+  char *buffer, *current, *comma, *tmp;
+  int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
+
+  if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
+    return -1;
+
+  hwloc_bitmap_fill(set); /* start from a full set, then clear the gaps between enabled-segments */
+
+  current = buffer;
+  prevlast = -1;
+
+  while (1) {
+    /* save a pointer to the next comma and erase it to simplify things */
+    comma = strchr(current, ',');
+    if (comma)
+      *comma = '\0';
+
+    /* find current enabled-segment bounds */
+    nextfirst = strtoul(current, &tmp, 0);
+    if (*tmp == '-')
+      nextlast = strtoul(tmp+1, NULL, 0);
+    else
+      nextlast = nextfirst; /* single index, not a range */
+    if (prevlast+1 <= nextfirst-1)
+      hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1); /* clear the gap before this segment */
+
+    /* switch to next enabled-segment */
+    prevlast = nextlast;
+    if (!comma)
+      break;
+    current = comma+1;
+  }
+
+  hwloc_bitmap_clr_range(set, prevlast+1, -1); /* clear everything above the last segment (end -1 is unbounded) */
+  free(buffer);
+  return 0;
+}
+
+/* Open maskpath through fsroot_fd and parse it as a cpulist. On failure, the content of set is undefined */
+static __hwloc_inline int
+hwloc__read_path_as_cpulist(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
+{
+  int fd, err;
+  fd = hwloc_open(maskpath, fsroot_fd);
+  if (fd < 0)
+    return -1;
+  err = hwloc__read_fd_as_cpulist(fd, set);
+  close(fd); /* closed whether parsing succeeded or not */
+  return err;
+}
+
+/* Allocate a bitmap and fill it from the cpulist file maskpath. Returns NULL on failure, the bitmap is then freed */
+static __hwloc_inline hwloc_bitmap_t
+hwloc__alloc_read_path_as_cpulist(const char *maskpath, int fsroot_fd)
+{
+  hwloc_bitmap_t set;
+  int err;
+  set = hwloc_bitmap_alloc_full();
+  if (!set)
+    return NULL;
+  err = hwloc__read_path_as_cpulist(maskpath, set, fsroot_fd);
+  if (err < 0) {
+    hwloc_bitmap_free(set); /* don't leak the bitmap when the file cannot be read */
+    return NULL;
+  } else
+    return set; /* ownership goes to the caller */
+}
+
+
+/*****************************
+ ******* CpuBind Hooks *******
+ *****************************/
+
+int
+hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  /* Bind thread tid to the CPUs of hwloc_set. Returns 0 on success, -1 with errno set on error.
+   * The resulting binding is always strict */
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int err;
+
+  last = hwloc_bitmap_last(hwloc_set);
+  if (last == -1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  setsize = CPU_ALLOC_SIZE(last+1);
+  plinux_set = CPU_ALLOC(last+1);
+  if (!plinux_set)
+    return -1; /* allocation failed, don't let CPU_ZERO_S() below dereference NULL */
+
+  CPU_ZERO_S(setsize, plinux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET_S(cpu, setsize, plinux_set);
+  hwloc_bitmap_foreach_end();
+  err = sched_setaffinity(tid, setsize, plinux_set);
+  CPU_FREE(plinux_set);
+  return err;
+#elif defined(HWLOC_HAVE_CPU_SET)
+  cpu_set_t linux_set;
+  unsigned cpu;
+
+  CPU_ZERO(&linux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET(cpu, &linux_set);
+  hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#elif defined(HWLOC_HAVE_SYSCALL)
+  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set); /* only the first ulong of the set fits here */
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#else /* !SYSCALL */
+  errno = ENOSYS;
+  return -1;
+#endif /* !SYSCALL */
+}
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+/*
+ * On some kernels, sched_getaffinity requires the output size to be larger
+ * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
+ * Try sched_getaffinity on ourselves until we find a nr_cpus value that makes
+ * the kernel happy. The value found is cached for the following calls.
+ */
+static int
+hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
+{
+  static int _nr_cpus = -1;
+  int nr_cpus = _nr_cpus;
+  int fd;
+
+  if (nr_cpus != -1)
+    /* already computed */
+    return nr_cpus;
+
+  if (topology->levels[0][0]->complete_cpuset)
+    /* start with a nr_cpus that may contain the whole topology */
+    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
+  if (nr_cpus <= 0)
+    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
+    nr_cpus = 1;
+
+  /* reading /sys/devices/system/cpu/kernel_max would be easier (single value to parse instead of a list),
+   * but its value may be way too large (5119 on CentOS7).
+   * /sys/devices/system/cpu/possible is better because it matches the current hardware.
+   */
+  fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
+  if (fd >= 0) {
+    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc();
+    if (possible_bitmap && hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) { /* skip the hint if the allocation failed */
+      int max_possible = hwloc_bitmap_last(possible_bitmap);
+      hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
+
+      if (nr_cpus < max_possible + 1)
+        nr_cpus = max_possible + 1;
+    }
+    close(fd);
+    hwloc_bitmap_free(possible_bitmap); /* no-op when the allocation failed */
+  }
+
+  while (1) {
+    cpu_set_t *set = CPU_ALLOC(nr_cpus);
+    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
+    int err;
+    if (!set)
+      return nr_cpus; /* out of memory: return the current guess without caching it instead of looping forever */
+    err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
+    CPU_FREE(set);
+    nr_cpus = setsize * 8; /* that's the value that was actually tested */
+    if (!err)
+      /* Found it. Only update the static value with the final one, to avoid sharing
+       * intermediate values that we modify, in case there's ever multiple concurrent calls. */
+      return _nr_cpus = nr_cpus;
+    nr_cpus *= 2;
+  }
+}
+#endif
+
+int
+hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  int err __hwloc_attribute_unused;
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int kernel_nr_cpus;
+
+  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
+  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
+  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
+  plinux_set = CPU_ALLOC(kernel_nr_cpus);
 
   err = sched_getaffinity(tid, setsize, plinux_set);
 
@@ -511,7 +1029,7 @@ hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
   for(cpu=0; cpu<CPU_SETSIZE; cpu++)
     if (CPU_ISSET(cpu, &linux_set))
       hwloc_bitmap_set(hwloc_set, cpu);
-#elif defined(HWLOC_HAVE__SYSCALL3)
+#elif defined(HWLOC_HAVE_SYSCALL)
   unsigned long mask;
 
 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
@@ -523,10 +1041,10 @@ hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
     return -1;
 
   hwloc_bitmap_from_ulong(hwloc_set, mask);
-#else /* !_SYSCALL3 */
+#else /* !SYSCALL */
   errno = ENOSYS;
   return -1;
-#endif /* !_SYSCALL3 */
+#endif /* !SYSCALL */
 
   return 0;
 }
@@ -990,8 +1508,12 @@ hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribut
   char buf[1024] = "";
   char name[64];
   char *tmp;
-  FILE *file;
-  int i;
+  int fd, i, err;
+
+  /* TODO: find a way to use sched_getcpu().
+   * either compare tid with gettid() in all callbacks.
+   * or pass gettid() in the callback data.
+   */
 
   if (!tid) {
 #ifdef SYS_gettid
@@ -1003,17 +1525,18 @@ hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribut
   }
 
   snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
-  file = fopen(name, "r");
-  if (!file) {
+  fd = open(name, O_RDONLY); /* no fsroot for real /proc */
+  if (fd < 0) {
     errno = ENOSYS;
     return -1;
   }
-  tmp = fgets(buf, sizeof(buf), file);
-  fclose(file);
-  if (!tmp) {
+  err = read(fd, buf, sizeof(buf)-1); /* read -1 to put the ending \0 */
+  close(fd);
+  if (err <= 0) {
     errno = ENOSYS;
     return -1;
   }
+  buf[err-1] = '\0';
 
   tmp = strrchr(buf, ')');
   if (!tmp) {
@@ -1108,6 +1631,17 @@ hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bi
     errno = ENOSYS;
     return -1;
   }
+
+#if HAVE_DECL_SCHED_GETCPU
+  {
+    int pu = sched_getcpu();
+    if (pu >= 0) {
+      hwloc_bitmap_only(hwloc_set, pu);
+      return 0;
+    }
+  }
+#endif
+
   return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
 }
 
@@ -1117,15 +1651,16 @@ hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bi
  ****** Membind hooks ******
  ***************************/
 
-#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
 static int
 hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
 {
   switch (policy) {
   case HWLOC_MEMBIND_DEFAULT:
-  case HWLOC_MEMBIND_FIRSTTOUCH:
     *linuxpolicy = MPOL_DEFAULT;
     break;
+  case HWLOC_MEMBIND_FIRSTTOUCH:
+    *linuxpolicy = MPOL_LOCAL;
+    break;
   case HWLOC_MEMBIND_BIND:
     if (flags & HWLOC_MEMBIND_STRICT)
       *linuxpolicy = MPOL_BIND;
@@ -1166,7 +1701,7 @@ hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribut
    * and round up to the nearest multiple of BITS_PER_LONG */
   max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
 
-  linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
+  linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(unsigned long));
   if (!linuxmask) {
     hwloc_bitmap_free(linux_nodeset);
     errno = ENOMEM;
@@ -1200,9 +1735,7 @@ hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_
   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
     hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
 }
-#endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
 
-#ifdef HWLOC_HAVE_MBIND
 static int
 hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
@@ -1213,7 +1746,7 @@ hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t
   unsigned linuxflags = 0;
   int err;
 
-  remainder = (uintptr_t) addr & (sysconf(_SC_PAGESIZE)-1);
+  remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
   addr = (char*) addr - remainder;
   len += remainder;
 
@@ -1221,28 +1754,30 @@ hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t
   if (err < 0)
     return err;
 
-  if (linuxpolicy == MPOL_DEFAULT)
+  if (linuxpolicy == MPOL_DEFAULT) {
     /* Some Linux kernels don't like being passed a set */
-    return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
+    return hwloc_mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
+
+  } else if (linuxpolicy == MPOL_LOCAL) {
+    if (!hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology))) {
+      errno = EXDEV;
+      return -1;
+    }
+    /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
+    return hwloc_mbind((void *) addr, len, MPOL_PREFERRED, NULL, 0, 0);
+  }
 
   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
   if (err < 0)
     goto out;
 
   if (flags & HWLOC_MEMBIND_MIGRATE) {
-#ifdef MPOL_MF_MOVE
     linuxflags = MPOL_MF_MOVE;
     if (flags & HWLOC_MEMBIND_STRICT)
       linuxflags |= MPOL_MF_STRICT;
-#else
-    if (flags & HWLOC_MEMBIND_STRICT) {
-      errno = ENOSYS;
-      goto out_with_mask;
-    }
-#endif
   }
 
-  err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
+  err = hwloc_mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
   if (err < 0)
     goto out_with_mask;
 
@@ -1262,20 +1797,18 @@ hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nod
   int err;
 
   buffer = hwloc_alloc_mmap(topology, len);
-  if (buffer == MAP_FAILED)
+  if (!buffer)
     return NULL;
 
   err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
-  if (err < 0 && policy & HWLOC_MEMBIND_STRICT) {
+  if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
     munmap(buffer, len);
     return NULL;
   }
 
   return buffer;
 }
-#endif /* HWLOC_HAVE_MBIND */
 
-#ifdef HWLOC_HAVE_SET_MEMPOLICY
 static int
 hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
 {
@@ -1288,32 +1821,36 @@ hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodese
   if (err < 0)
     return err;
 
-  if (linuxpolicy == MPOL_DEFAULT)
+  if (linuxpolicy == MPOL_DEFAULT) {
     /* Some Linux kernels don't like being passed a set */
-    return set_mempolicy(linuxpolicy, NULL, 0);
+    return hwloc_set_mempolicy(linuxpolicy, NULL, 0);
+
+  } else if (linuxpolicy == MPOL_LOCAL) {
+    if (!hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology))) {
+      errno = EXDEV;
+      return -1;
+    }
+    /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
+    return hwloc_set_mempolicy(MPOL_PREFERRED, NULL, 0);
+  }
 
   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
   if (err < 0)
     goto out;
 
   if (flags & HWLOC_MEMBIND_MIGRATE) {
-#ifdef HWLOC_HAVE_MIGRATE_PAGES
-    unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
-    if (fullmask) {
-      memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
-      err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
-      free(fullmask);
-    } else
-      err = -1;
+    unsigned long *fullmask;
+    fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(*fullmask));
+    if (!fullmask)
+      goto out_with_mask;
+    memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(unsigned long));
+    err = hwloc_migrate_pages(0, max_os_index+1, fullmask, linuxmask); /* returns the (positive) number of non-migrated pages on success */
+    free(fullmask);
     if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
       goto out_with_mask;
-#else
-    errno = ENOSYS;
-    goto out_with_mask;
-#endif
   }
 
-  err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
+  err = hwloc_set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
   if (err < 0)
     goto out_with_mask;
 
@@ -1335,22 +1872,48 @@ hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodese
 static int
 hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
 {
-  static int max_numnodes = -1;
+  static int _max_numnodes = -1; /* cached result, only written once the final value is known */
   int linuxpolicy;
+  int fd, max_numnodes; /* max_numnodes is the per-call working value, always a multiple of HWLOC_BITS_PER_LONG */
 
-  if (max_numnodes != -1)
+  if (_max_numnodes != -1)
     /* already computed */
-    return max_numnodes;
+    return _max_numnodes;
 
   /* start with a single ulong, it's the minimal and it's enough for most machines */
   max_numnodes = HWLOC_BITS_PER_LONG;
+
+  /* try to get the max from sysfs */
+  fd = open("/sys/devices/system/node/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
+  if (fd >= 0) {
+    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc();
+    if (possible_bitmap && hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) { /* skip the hint if the allocation failed */
+      int max_possible = hwloc_bitmap_last(possible_bitmap);
+      hwloc_debug_bitmap("possible NUMA nodes are %s\n", possible_bitmap);
+      /* round up to whole ulongs: masks are allocated as max_numnodes/HWLOC_BITS_PER_LONG ulongs, here and in callers */
+      if (max_numnodes < max_possible + 1)
+        max_numnodes = (max_possible + HWLOC_BITS_PER_LONG) & ~(HWLOC_BITS_PER_LONG - 1);
+    }
+    close(fd);
+    hwloc_bitmap_free(possible_bitmap);
+  }
+
   while (1) {
-    unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
-    int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
+    unsigned long *mask;
+    int err;
+    mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(*mask));
+    if (!mask)
+      /* we can't return anything sane, assume the default size will work */
+      return _max_numnodes = max_numnodes;
+
+    err = hwloc_get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
     free(mask);
     if (!err || errno != EINVAL)
-      /* found it */
-      return max_numnodes;
+      /* Found it. Only update the static value with the final one,
+       * to avoid sharing intermediate values that we modify,
+       * in case there's ever multiple concurrent calls.
+       */
+      return _max_numnodes = max_numnodes;
     max_numnodes *= 2;
   }
 }
@@ -1360,6 +1923,7 @@ hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *pol
 {
   switch (linuxpolicy) {
   case MPOL_DEFAULT:
+  case MPOL_LOCAL: /* converted from MPOL_PREFERRED + empty nodeset by the caller */
     *policy = HWLOC_MEMBIND_FIRSTTOUCH;
     return 0;
   case MPOL_PREFERRED:
@@ -1375,27 +1939,38 @@ hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *pol
   }
 }
 
-static int
-hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+static int hwloc_linux_mask_is_empty(unsigned max_os_index, unsigned long *linuxmask) /* 1 if no bit is set, 0 otherwise */
 {
-  unsigned max_os_index;
-  unsigned long *linuxmask;
-  int linuxpolicy;
-  int err;
+  unsigned i;
+  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++) /* max_os_index is a number of bits, scan whole ulongs */
+    if (linuxmask[i])
+      return 0;
+  return 1;
+}
+
+static int
+hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+  unsigned max_os_index;
+  unsigned long *linuxmask;
+  int linuxpolicy;
+  int err;
 
   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
 
-  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
-  if (!linuxmask) {
-    errno = ENOMEM;
+  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(*linuxmask));;
+  if (!linuxmask)
     goto out;
-  }
 
-  err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
+  err = hwloc_get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
   if (err < 0)
-    goto out_with_mask;
+    goto out_with_linuxmask;
 
-  if (linuxpolicy == MPOL_DEFAULT) {
+  /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
+  if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
+    linuxpolicy = MPOL_LOCAL;
+
+  if (linuxpolicy == MPOL_DEFAULT || linuxpolicy == MPOL_LOCAL) {
     hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
   } else {
     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
@@ -1403,12 +1978,12 @@ hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t no
 
   err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
   if (err < 0)
-    goto out_with_mask;
+    goto out_with_linuxmask;
 
   free(linuxmask);
   return 0;
 
- out_with_mask:
+ out_with_linuxmask:
   free(linuxmask);
  out:
   return -1;
@@ -1418,8 +1993,9 @@ static int
 hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
 {
   unsigned max_os_index;
-  unsigned long *linuxmask, *globallinuxmask;
-  int linuxpolicy, globallinuxpolicy = 0;
+  unsigned long *linuxmask;
+  unsigned long *globallinuxmask;
+  int linuxpolicy = 0, globallinuxpolicy = 0; /* shut-up the compiler */
   int mixed = 0;
   int full = 0;
   int first = 1;
@@ -1430,23 +2006,23 @@ hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t
 
   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
 
-  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
-  if (!linuxmask) {
-    errno = ENOMEM;
-    goto out;
-  }
-  globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
-  if (!globallinuxmask) {
-    errno = ENOMEM;
-    goto out_with_masks;
-  }
+  /* globallinuxmask aggregates the masks of all pages, so every word of it must start
+   * cleared (a memset of sizeof(*globallinuxmask) would only clear the first word) */
+  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(*linuxmask));
+  globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(*globallinuxmask));
+  if (!linuxmask || !globallinuxmask)
+    goto out_with_linuxmasks;
 
   for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
       tmpaddr < (char *)addr + len;
       tmpaddr += pagesize) {
-    err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
+    err = hwloc_get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
     if (err < 0)
-      goto out_with_masks;
+      goto out_with_linuxmasks;
+
+    /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
+    if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
+      linuxpolicy = MPOL_LOCAL;
 
     /* use the first found policy. if we find a different one later, set mixed to 1 */
     if (first)
@@ -1454,8 +2030,8 @@ hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t
     else if (globallinuxpolicy != linuxpolicy)
       mixed = 1;
 
-    /* agregate masks, and set full to 1 if we ever find DEFAULT */
-    if (full || linuxpolicy == MPOL_DEFAULT) {
+    /* aggregate masks, and set full to 1 if we ever find DEFAULT or LOCAL */
+    if (full || linuxpolicy == MPOL_DEFAULT || linuxpolicy == MPOL_LOCAL) {
       full = 1;
     } else {
       for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
@@ -1470,7 +2046,7 @@ hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t
   } else {
     err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
     if (err < 0)
-      goto out_with_masks;
+      goto out_with_linuxmasks;
   }
 
   if (full) {
@@ -1479,22 +2055,104 @@ hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t
     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
   }
 
-  free(globallinuxmask);
   free(linuxmask);
+  free(globallinuxmask);
   return 0;
 
- out_with_masks:
-  free(globallinuxmask);
+ out_with_linuxmasks:
   free(linuxmask);
- out:
+  free(globallinuxmask);
   return -1;
 }
 
-#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+/* Fill nodeset with the nodes reported by hwloc_move_pages() for the pages covering [addr, addr+len);
+ * entries with a negative status are skipped. Returns 0 on success, -1 on failure. */
+static int
+hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
+{
+  unsigned offset;
+  unsigned long count, i; /* i must be as wide as count: a 32bit i*pagesize wraps for areas of 4GB and more */
+  void **pages;
+  int *status;
+  int pagesize = hwloc_getpagesize();
+  int ret;
+
+  offset = ((unsigned long) addr) & (pagesize-1);
+  addr = ((char*) addr) - offset;
+  len += offset;
+  count = (len + pagesize-1)/pagesize;
+  pages = malloc(count*sizeof(*pages));
+  status = malloc(count*sizeof(*status));
+  if (!pages || !status) {
+    ret = -1;
+    goto out_with_pages;
+  }
+
+  for(i=0; i<count; i++)
+    pages[i] = ((char*)addr) + i*pagesize;
+  ret = hwloc_move_pages(0, count, pages, NULL, status, 0);
+  if (ret  < 0)
+    goto out_with_pages;
+
+  hwloc_bitmap_zero(nodeset);
+  for(i=0; i<count; i++)
+    if (status[i] >= 0)
+      hwloc_bitmap_set(nodeset, status[i]);
+  ret = 0; /* not really useful since move_pages never returns > 0 */
+
+ out_with_pages:
+  free(pages);
+  free(status);
+  return ret;
+}
+
+static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);
+
+/* get_allowed_resources hook: runs hwloc_linux__get_allowed_resources() on the HWLOC_FSROOT root (default "/")
+ * and stores the reported cpuset name, if any, as "LinuxCgroup" info. Returns 0, or -1 if the root cannot be opened. */
+static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
+{
+  const char *fsroot_path = getenv("HWLOC_FSROOT");
+  char *cpuset_name = NULL;
+  int root_fd = -1;
+
+  if (!fsroot_path)
+    fsroot_path = "/";
+
+  if (strcmp(fsroot_path, "/")) {
+#ifdef HAVE_OPENAT
+    root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
+    if (root_fd < 0)
+      goto out;
+#else
+    errno = ENOSYS;
+    goto out;
+#endif
+  }
+
+  /* we could also error-out if the current topology doesn't actually match the system,
+   * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
+   * Just trust the user when they set THISSYSTEM=1. It enables hacky
+   * tests such as restricting random XML or synthetic to the current
+   * machine (uses the default cgroup).
+   */
+  hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
+  if (cpuset_name) {
+    hwloc__add_info_nodup(&topology->levels[0][0]->infos, &topology->levels[0][0]->infos_count,
+			  "LinuxCgroup", cpuset_name, 1 /* replace */);
+    free(cpuset_name);
+  }
+  if (root_fd != -1)
+    close(root_fd);
+  return 0; /* success must not fall through into the error path below */
+
+ out:
+  return -1;
+}
 
 void
 hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
-			struct hwloc_topology_support *support __hwloc_attribute_unused)
+			struct hwloc_topology_support *support)
 {
   hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
   hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
@@ -1511,25 +2169,26 @@ hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
   hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
   hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
   hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
-#ifdef HWLOC_HAVE_SET_MEMPOLICY
   hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
   hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
   hooks->get_area_membind = hwloc_linux_get_area_membind;
-#endif /* HWLOC_HAVE_SET_MEMPOLICY */
-#ifdef HWLOC_HAVE_MBIND
   hooks->set_area_membind = hwloc_linux_set_area_membind;
+  hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
   hooks->alloc_membind = hwloc_linux_alloc_membind;
   hooks->alloc = hwloc_alloc_mmap;
   hooks->free_membind = hwloc_free_mmap;
   support->membind->firsttouch_membind = 1;
   support->membind->bind_membind = 1;
   support->membind->interleave_membind = 1;
-#endif /* HWLOC_HAVE_MBIND */
-#if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
   support->membind->migrate_membind = 1;
-#endif
-}
+  hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
 
+  /* The get_allowed_resources() hook also works in the !thissystem case
+   * (it just reads fsroot files) but hooks are only setup if thissystem.
+   * Not an issue because this hook isn't used unless THISSYSTEM_ALLOWED_RESOURCES
+   * which also requires THISSYSTEM, which means this function is called.
+   */
+}
 
 
 /*******************************************
@@ -1540,202 +2199,61 @@ hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
 struct hwloc_linux_cpuinfo_proc {
   /* set during hwloc_linux_parse_cpuinfo */
   unsigned long Pproc;
-  /* set during hwloc_linux_parse_cpuinfo or -1 if unknown*/
-  long Pcore, Ppkg;
-  /* set later, or -1 if unknown */
-  long Lcore, Lpkg;
 
   /* custom info, set during hwloc_linux_parse_cpuinfo */
-  struct hwloc_obj_info_s *infos;
+  struct hwloc_info_s *infos;
   unsigned infos_count;
 };
 
-static int
-hwloc_parse_sysfs_unsigned(const char *mappath, unsigned *value, int fsroot_fd)
-{
-  char string[11];
-  FILE * fd;
-
-  fd = hwloc_fopen(mappath, "r", fsroot_fd);
-  if (!fd) {
-    *value = -1;
-    return -1;
-  }
-
-  if (!fgets(string, 11, fd)) {
-    *value = -1;
-    fclose(fd);
-    return -1;
-  }
-  *value = strtoul(string, NULL, 10);
-
-  fclose(fd);
-
-  return 0;
-}
-
-
-/* kernel cpumaps are composed of an array of 32bits cpumasks */
-#define KERNEL_CPU_MASK_BITS 32
-#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
-
-int
-hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
-{
-  unsigned long *maps;
-  unsigned long map;
-  int nr_maps = 0;
-  static int nr_maps_allocated = 8; /* only compute the power-of-two above the kernel cpumask size once */
-  int i;
-
-  maps = malloc(nr_maps_allocated * sizeof(*maps));
-
-  /* reset to zero first */
-  hwloc_bitmap_zero(set);
-
-  /* parse the whole mask */
-  while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
-    {
-      if (nr_maps == nr_maps_allocated) {
-	nr_maps_allocated *= 2;
-	maps = realloc(maps, nr_maps_allocated * sizeof(*maps));
-      }
-
-      if (!map && !nr_maps)
-	/* ignore the first map if it's empty */
-	continue;
-
-      memmove(&maps[1], &maps[0], nr_maps*sizeof(*maps));
-      maps[0] = map;
-      nr_maps++;
-    }
-
-  /* convert into a set */
-#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
-  for(i=0; i<nr_maps; i++)
-    hwloc_bitmap_set_ith_ulong(set, i, maps[i]);
-#else
-  for(i=0; i<(nr_maps+1)/2; i++) {
-    unsigned long mask;
-    mask = maps[2*i];
-    if (2*i+1<nr_maps)
-      mask |= maps[2*i+1] << KERNEL_CPU_MASK_BITS;
-    hwloc_bitmap_set_ith_ulong(set, i, mask);
-  }
-#endif
-
-  free(maps);
-
-  return 0;
-}
-
-static hwloc_bitmap_t
-hwloc_parse_cpumap(const char *mappath, int fsroot_fd)
-{
-  hwloc_bitmap_t set;
-  FILE * file;
-
-  file = hwloc_fopen(mappath, "r", fsroot_fd);
-  if (!file)
-    return NULL;
-
-  set = hwloc_bitmap_alloc();
-  hwloc_linux_parse_cpumap_file(file, set);
-
-  fclose(file);
-  return set;
-}
-
-static char *
-hwloc_strdup_mntpath(const char *escapedpath, size_t length)
-{
-  char *path = malloc(length+1);
-  const char *src = escapedpath, *tmp;
-  char *dst = path;
-
-  while ((tmp = strchr(src, '\\')) != NULL) {
-    strncpy(dst, src, tmp-src);
-    dst += tmp-src;
-    if (!strncmp(tmp+1, "040", 3))
-      *dst = ' ';
-    else if (!strncmp(tmp+1, "011", 3))
-      *dst = '	';
-    else if (!strncmp(tmp+1, "012", 3))
-      *dst = '\n';
-    else
-      *dst = '\\';
-    dst++;
-    src = tmp+4;
-  }
-
-  strcpy(dst, src);
-
-  return path;
-}
-
 static void
-hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, int fsroot_fd)
+hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
 {
-#define PROC_MOUNT_LINE_LEN 512
-  char line[PROC_MOUNT_LINE_LEN];
+  char *mount_path;
+  struct mntent mntent;
+  char *buf;
   FILE *fd;
+  int err;
+  size_t bufsize;
 
   *cgroup_mntpnt = NULL;
   *cpuset_mntpnt = NULL;
 
-  /* ideally we should use setmntent, getmntent, hasmntopt and endmntent,
-   * but they do not support fsroot_fd.
-   */
-
-  fd = hwloc_fopen("/proc/mounts", "r", fsroot_fd);
+  if (root_path) {
+    /* setmntent() doesn't support openat(), so use the root_path directly */
+    err = asprintf(&mount_path, "%s/proc/mounts", root_path);
+    if (err < 0)
+      return;
+    fd = setmntent(mount_path, "r");
+    free(mount_path);
+  } else {
+    fd = setmntent("/proc/mounts", "r");
+  }
   if (!fd)
     return;
 
-  while (fgets(line, sizeof(line), fd)) {
-    char *path;
-    char *type;
-    char *tmp;
-
-    /* remove the ending " 0 0\n" that the kernel always adds */
-    tmp = line + strlen(line) - 5;
-    if (tmp < line || strcmp(tmp, " 0 0\n"))
-      fprintf(stderr, "Unexpected end of /proc/mounts line `%s'\n", line);
-    else
-      *tmp = '\0';
-
-    /* path is after first field and a space */
-    tmp = strchr(line, ' ');
-    if (!tmp)
-      continue;
-    path = tmp+1;
-
-    /* type is after path, which may not contain spaces since the kernel escaped them to \040
-     * (see the manpage of getmntent) */
-    tmp = strchr(path, ' ');
-    if (!tmp)
-      continue;
-    type = tmp+1;
-    /* mark the end of path to ease upcoming strdup */
-    *tmp = '\0';
+  /* getmntent_r() doesn't actually report an error when the buffer
+   * is too small, it just silently truncates things, so we can't
+   * dynamically resize the buffer.
+   * Linux limits mount type, string, and options to one page each and
+   * getmntent() limits the line size to 4kB: 4*pagesize is far above both.
+   */
+  bufsize = hwloc_getpagesize()*4;
+  buf = malloc(bufsize);
+  if (!buf) {
+    endmntent(fd); /* don't leak the stream opened by setmntent() above */
+    return;
+  }
 
-    if (!strncmp(type, "cpuset ", 7)) {
-      /* found a cpuset mntpnt */
-      hwloc_debug("Found cpuset mount point on %s\n", path);
-      *cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+  while (getmntent_r(fd, &mntent, buf, bufsize)) {
+    if (!strcmp(mntent.mnt_type, "cpuset")) {
+      hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
+      *cpuset_mntpnt = strdup(mntent.mnt_dir);
       break;
-
-    } else if (!strncmp(type, "cgroup ", 7)) {
+    } else if (!strcmp(mntent.mnt_type, "cgroup")) {
       /* found a cgroup mntpnt */
-      char *opt, *opts;
+      char *opt, *opts = mntent.mnt_opts;
       int cpuset_opt = 0;
       int noprefix_opt = 0;
-
-      /* find options */
-      tmp = strchr(type, ' ');
-      if (!tmp)
-	continue;
-      opts = tmp+1;
-
       /* look at options */
       while ((opt = strsep(&opts, ",")) != NULL) {
 	if (!strcmp(opt, "cpuset"))
@@ -1745,19 +2263,19 @@ hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, int f
       }
       if (!cpuset_opt)
 	continue;
-
       if (noprefix_opt) {
-	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", path);
-	*cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
+	*cpuset_mntpnt = strdup(mntent.mnt_dir);
       } else {
-	hwloc_debug("Found cgroup/cpuset mount point on %s\n", path);
-	*cgroup_mntpnt = hwloc_strdup_mntpath(path, type-path);
+	hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
+	*cgroup_mntpnt = strdup(mntent.mnt_dir);
       }
       break;
     }
   }
 
-  fclose(fd);
+  endmntent(fd);
+  free(buf);
 }
 
 /*
@@ -1771,22 +2289,23 @@ hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
 {
 #define CPUSET_NAME_LEN 128
   char cpuset_name[CPUSET_NAME_LEN];
-  FILE *fd;
+  FILE *file;
+  int err;
   char *tmp;
 
   /* check whether a cgroup-cpuset is enabled */
   if (!pid)
-    fd = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
+    file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
   else {
-    char path[] = "/proc/XXXXXXXXXX/cgroup";
+    char path[] = "/proc/XXXXXXXXXXX/cgroup";
     snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
-    fd = hwloc_fopen(path, "r", fsroot_fd);
+    file = hwloc_fopen(path, "r", fsroot_fd);
   }
-  if (fd) {
+  if (file) {
     /* find a cpuset line */
 #define CGROUP_LINE_LEN 256
     char line[CGROUP_LINE_LEN];
-    while (fgets(line, sizeof(line), fd)) {
+    while (fgets(line, sizeof(line), file)) {
       char *end, *colon = strchr(line, ':');
       if (!colon)
 	continue;
@@ -1794,35 +2313,31 @@ hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
 	continue;
 
       /* found a cgroup-cpuset line, return the name */
-      fclose(fd);
+      fclose(file);
       end = strchr(colon, '\n');
       if (end)
 	*end = '\0';
       hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
       return strdup(colon+8);
     }
-    fclose(fd);
+    fclose(file);
   }
 
   /* check whether a cpuset is enabled */
   if (!pid)
-    fd = hwloc_fopen("/proc/self/cpuset", "r", fsroot_fd);
+    err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
   else {
-    char path[] = "/proc/XXXXXXXXXX/cpuset";
+    char path[] = "/proc/XXXXXXXXXXX/cpuset";
     snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
-    fd = hwloc_fopen(path, "r", fsroot_fd);
+    err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
   }
-  if (!fd) {
+  if (err < 0) {
     /* found nothing */
     hwloc_debug("%s", "No cgroup or cpuset found\n");
     return NULL;
   }
 
   /* found a cpuset, return the name */
-  tmp = fgets(cpuset_name, sizeof(cpuset_name), fd);
-  fclose(fd);
-  if (!tmp)
-    return NULL;
   tmp = strchr(cpuset_name, '\n');
   if (tmp)
     *tmp = '\0';
@@ -1835,143 +2350,53 @@ hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
  * the cpuset filesystem (usually mounted in / or /dev) where there
  * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
  */
-static char *
-hwloc_read_linux_cpuset_mask(const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name, const char *attr_name, int fsroot_fd)
+static void
+hwloc_admin_disable_set_from_cpuset(int root_fd,
+				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
+				    const char *attr_name,
+				    hwloc_bitmap_t admin_enabled_set)
 {
 #define CPUSET_FILENAME_LEN 256
   char cpuset_filename[CPUSET_FILENAME_LEN];
-  FILE *fd;
-  char *info = NULL, *tmp;
-  ssize_t ssize;
-  size_t size;
+  int err;
 
   if (cgroup_mntpnt) {
     /* try to read the cpuset from cgroup */
     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
     hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
-    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
-    if (fd)
-      goto gotfile;
   } else if (cpuset_mntpnt) {
     /* try to read the cpuset directly */
     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
     hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
-    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
-    if (fd)
-      goto gotfile;
   }
 
-  /* found no cpuset description, ignore it */
-  hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
-  goto out;
-
-gotfile:
-  ssize = getline(&info, &size, fd);
-  fclose(fd);
-  if (ssize < 0)
-    goto out;
-  if (!info)
-    goto out;
-
-  tmp = strchr(info, '\n');
-  if (tmp)
-    *tmp = '\0';
-
-out:
-  return info;
-}
-
-static void
-hwloc_admin_disable_set_from_cpuset(struct hwloc_linux_backend_data_s *data,
-				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
-				    const char *attr_name,
-				    hwloc_bitmap_t admin_enabled_cpus_set)
-{
-  char *cpuset_mask;
-  char *current, *comma, *tmp;
-  int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
-  hwloc_bitmap_t tmpset;
-
-  cpuset_mask = hwloc_read_linux_cpuset_mask(cgroup_mntpnt, cpuset_mntpnt, cpuset_name,
-					     attr_name, data->root_fd);
-  if (!cpuset_mask)
-    return;
-
-  hwloc_debug("found cpuset %s: %s\n", attr_name, cpuset_mask);
-
-  current = cpuset_mask;
-  prevlast = -1;
-
-  while (1) {
-    /* save a pointer to the next comma and erase it to simplify things */
-    comma = strchr(current, ',');
-    if (comma)
-      *comma = '\0';
-
-    /* find current enabled-segment bounds */
-    nextfirst = strtoul(current, &tmp, 0);
-    if (*tmp == '-')
-      nextlast = strtoul(tmp+1, NULL, 0);
-    else
-      nextlast = nextfirst;
-    if (prevlast+1 <= nextfirst-1) {
-      hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
-      hwloc_bitmap_clr_range(admin_enabled_cpus_set, prevlast+1, nextfirst-1);
-    }
+  err = hwloc__read_path_as_cpulist(cpuset_filename, admin_enabled_set, root_fd); /* NOTE(review): cpuset_filename is only filled when cgroup_mntpnt or cpuset_mntpnt is non-NULL; presumably callers guarantee one of them - confirm */
 
-    /* switch to next enabled-segment */
-    prevlast = nextlast;
-    if (!comma)
-      break;
-    current = comma+1;
+  if (err < 0) {
+    hwloc_debug("failed to read cpuset '%s' attribute '%s'\n", cpuset_name, attr_name);
+    hwloc_bitmap_fill(admin_enabled_set);
+  } else {
+    hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_set);
   }
-
-  hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
-  /* no easy way to clear until the infinity */
-  tmpset = hwloc_bitmap_alloc();
-  hwloc_bitmap_set_range(tmpset, 0, prevlast);
-  hwloc_bitmap_and(admin_enabled_cpus_set, admin_enabled_cpus_set, tmpset);
-  hwloc_bitmap_free(tmpset);
-
-  free(cpuset_mask);
 }
 
 static void
 hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
 			 const char *path,
-			 int prefixlength,
-			 uint64_t *local_memory,
-			 uint64_t *meminfo_hugepages_count,
-			 uint64_t *meminfo_hugepages_size,
-			 int onlytotal)
+			 uint64_t *local_memory)
 {
-  char string[64];
-  FILE *fd;
+  char *tmp;
+  char buffer[4096];
+  unsigned long long number;
 
-  fd = hwloc_fopen(path, "r", data->root_fd);
-  if (!fd)
+  if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
     return;
 
-  while (fgets(string, sizeof(string), fd) && *string != '\0')
-    {
-      unsigned long long number;
-      if (strlen(string) < (size_t) prefixlength)
-        continue;
-      if (sscanf(string+prefixlength, "MemTotal: %llu kB", (unsigned long long *) &number) == 1) {
-	*local_memory = number << 10;
-	if (onlytotal)
-	  break;
-      }
-      else if (!onlytotal) {
-	if (sscanf(string+prefixlength, "Hugepagesize: %llu", (unsigned long long *) &number) == 1)
-	  *meminfo_hugepages_size = number << 10;
-	else if (sscanf(string+prefixlength, "HugePages_Free: %llu", (unsigned long long *) &number) == 1)
-          /* these are free hugepages, not the total amount of huge pages */
-	  *meminfo_hugepages_count = number;
-      }
-    }
-
-  fclose(fd);
+  tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
+  if (tmp) {
+    number = strtoull(tmp+10, NULL, 10); /* 10 == strlen("MemTotal: ") */
+    *local_memory = number << 10; /* kB to bytes; *local_memory is left untouched when no MemTotal line is found */
+  }
 }
 
 #define SYSFS_NUMA_NODE_PATH_LEN 128
@@ -1979,32 +2404,29 @@ hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
 static void
 hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
 			   const char *dirpath,
-			   struct hwloc_obj_memory_s *memory,
+			   struct hwloc_numanode_attr_s *memory,
 			   uint64_t *remaining_local_memory)
 {
   DIR *dir;
   struct dirent *dirent;
-  unsigned long index_ = 1;
-  FILE *hpfd;
+  unsigned long index_ = 1; /* slot 0 is for normal pages */
   char line[64];
   char path[SYSFS_NUMA_NODE_PATH_LEN];
 
   dir = hwloc_opendir(dirpath, data->root_fd);
   if (dir) {
     while ((dirent = readdir(dir)) != NULL) {
+      int err;
       if (strncmp(dirent->d_name, "hugepages-", 10))
         continue;
       memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
-      sprintf(path, "%s/%s/nr_hugepages", dirpath, dirent->d_name);
-      hpfd = hwloc_fopen(path, "r", data->root_fd);
-      if (hpfd) {
-        if (fgets(line, sizeof(line), hpfd)) {
-          /* these are the actual total amount of huge pages */
-          memory->page_types[index_].count = strtoull(line, NULL, 0);
-          *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
-          index_++;
-        }
-	fclose(hpfd);
+      err = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
+      if ((size_t) err < sizeof(path)
+	  && !hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
+	/* these are the actual total amount of huge pages */
+	memory->page_types[index_].count = strtoull(line, NULL, 0);
+	*remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
+	index_++;
       }
     }
     closedir(dir);
@@ -2013,15 +2435,13 @@ hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
 }
 
 static void
-hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
-			      struct hwloc_linux_backend_data_s *data,
-			      struct hwloc_obj_memory_s *memory)
+hwloc_get_machine_meminfo(struct hwloc_linux_backend_data_s *data,
+			  struct hwloc_numanode_attr_s *memory)
 {
-  uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
   struct stat st;
   int has_sysfs_hugepages = 0;
-  const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
-  int types = 2;
+  int types = 1; /* only normal pages by default */
+  uint64_t remaining_local_memory;
   int err;
 
   err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
@@ -2030,71 +2450,39 @@ hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
     has_sysfs_hugepages = 1;
   }
 
-  if (topology->is_thissystem || pagesize_env) {
-    /* we cannot report any page_type info unless we have the page size.
-     * we'll take it either from the system if local, or from the debug env variable
-     */
-    memory->page_types_len = types;
-    memory->page_types = calloc(types, sizeof(*memory->page_types));
-  }
-
-  if (topology->is_thissystem) {
-    /* Get the page and hugepage sizes from sysconf */
-#ifdef HAVE__SC_LARGE_PAGESIZE
-    memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
-#endif
-    memory->page_types[0].size = hwloc_getpagesize(); /* might be overwritten later by /proc/meminfo or sysfs */
+  memory->page_types = calloc(types, sizeof(*memory->page_types));
+  if (!memory->page_types) {
+    memory->page_types_len = 0;
+    return;
   }
+  memory->page_types_len = 1; /* we'll increase it when successfully getting hugepage info */
 
-  hwloc_parse_meminfo_info(data, "/proc/meminfo", 0 /* no prefix */,
-			   &memory->local_memory,
-			   &meminfo_hugepages_count, &meminfo_hugepages_size,
-			   memory->page_types == NULL);
-
-  if (memory->page_types) {
-    uint64_t remaining_local_memory = memory->local_memory;
-    if (has_sysfs_hugepages) {
-      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
-      hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
-    } else {
-      /* use what we found in meminfo */
-      if (meminfo_hugepages_size) {
-        memory->page_types[1].size = meminfo_hugepages_size;
-        memory->page_types[1].count = meminfo_hugepages_count;
-        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
-      } else {
-        memory->page_types_len = 1;
-      }
-    }
-
-    if (pagesize_env) {
-      /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
-      memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
-      /* If failed, use 4kB */
-      if (!memory->page_types[0].size)
-	memory->page_types[0].size = 4096;
-    }
-    assert(memory->page_types[0].size); /* from sysconf if local or from the env */
-    /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
-     * may be 0 if no hugepage support in the kernel */
+  /* get the total memory */
+  hwloc_parse_meminfo_info(data, "/proc/meminfo",
+			   &memory->local_memory);
+  remaining_local_memory = memory->local_memory;
 
-    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+  if (has_sysfs_hugepages) {
+    /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+    hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
   }
+
+  /* use remaining memory as normal pages */
+  memory->page_types[0].size = data->pagesize;
+  memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
 }
 
 static void
-hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
-			      struct hwloc_linux_backend_data_s *data,
-			      const char *syspath, int node,
-			      struct hwloc_obj_memory_s *memory)
+hwloc_get_sysfs_node_meminfo(struct hwloc_linux_backend_data_s *data,
+			     const char *syspath, int node,
+			     struct hwloc_numanode_attr_s *memory)
 {
   char path[SYSFS_NUMA_NODE_PATH_LEN];
   char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
-  uint64_t meminfo_hugepages_count = 0;
-  uint64_t meminfo_hugepages_size = 0;
   struct stat st;
   int has_sysfs_hugepages = 0;
-  int types = 2;
+  int types = 1; /* only normal pages by default */
+  uint64_t remaining_local_memory;
   int err;
 
   sprintf(path, "%s/node%d/hugepages", syspath, node);
@@ -2104,73 +2492,77 @@ hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
     has_sysfs_hugepages = 1;
   }
 
-  if (topology->is_thissystem) {
-    memory->page_types_len = types;
-    memory->page_types = malloc(types*sizeof(*memory->page_types));
-    memset(memory->page_types, 0, types*sizeof(*memory->page_types));
+  memory->page_types = calloc(types, sizeof(*memory->page_types));
+  if (!memory->page_types) {
+    memory->page_types_len = 0;
+    return;
   }
+  memory->page_types_len = 1; /* we'll increase it when successfully getting hugepage info */
 
+  /* get the total memory */
   sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
   hwloc_parse_meminfo_info(data, meminfopath,
-			   snprintf(NULL, 0, "Node %d ", node),
-			   &memory->local_memory,
-			   &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
-			   memory->page_types == NULL);
-
-  if (memory->page_types) {
-    uint64_t remaining_local_memory = memory->local_memory;
-    if (has_sysfs_hugepages) {
-      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
-      hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
-    } else {
-      /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
-       * hwloc_get_procfs_meminfo_info must have been called earlier */
-      meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
-      /* use what we found in meminfo */
-      if (meminfo_hugepages_size) {
-        memory->page_types[1].count = meminfo_hugepages_count;
-        memory->page_types[1].size = meminfo_hugepages_size;
-        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
-      } else {
-        memory->page_types_len = 1;
-      }
-    }
-    /* update what's remaining as normal pages */
-    memory->page_types[0].size = hwloc_getpagesize();
-    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+			   &memory->local_memory);
+  remaining_local_memory = memory->local_memory;
+
+  if (has_sysfs_hugepages) {
+    /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+    hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
   }
+
+  /* use remaining memory as normal pages */
+  memory->page_types[0].size = data->pagesize;
+  memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
 }
 
-static void
-hwloc_parse_node_distance(const char *distancepath, unsigned nbnodes, float *distances, int fsroot_fd)
+static int
+hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, uint64_t *distances, int fsroot_fd)
 {
-  char string[4096]; /* enough for hundreds of nodes */
-  char *tmp, *next;
-  FILE * fd;
+  size_t len = (10+1)*nbnodes;
+  uint64_t *curdist = distances;
+  char *string;
+  unsigned i;
 
-  fd = hwloc_fopen(distancepath, "r", fsroot_fd);
-  if (!fd)
-    return;
+  string = malloc(len); /* space-separated %d */
+  if (!string)
+    goto out;
 
-  if (!fgets(string, sizeof(string), fd)) {
-    fclose(fd);
-    return;
+  for(i=0; i<nbnodes; i++) {
+    unsigned osnode = indexes[i];
+    char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
+    char *tmp, *next;
+    unsigned found;
+
+    /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
+     * store them in slots X*N...X*N+N-1 */
+    sprintf(distancepath, "%s/node%u/distance", path, osnode);
+    if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
+      goto out_with_string;
+
+    tmp = string;
+    found = 0;
+    while (tmp) {
+      unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
+      if (next == tmp)
+	break;
+      *curdist = (uint64_t) distance;
+      curdist++;
+      found++;
+      if (found == nbnodes)
+	break;
+      tmp = *next ? next+1 : NULL; /* stop at the terminating NUL instead of parsing the bytes after it */
+    }
+    if (found != nbnodes)
+      goto out_with_string;
   }
 
-  tmp = string;
-  while (tmp) {
-    unsigned distance = strtoul(tmp, &next, 0);
-    if (next == tmp)
-      break;
-    *distances = (float) distance;
-    distances++;
-    nbnodes--;
-    if (!nbnodes)
-      break;
-    tmp = next+1;
-  }
+  free(string);
+  return 0;
 
-  fclose(fd);
+ out_with_string:
+  free(string);
+ out:
+  return -1;
 }
 
 static void
@@ -2180,20 +2572,13 @@ hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
 			   const char *dmi_name, const char *hwloc_name)
 {
   char dmi_line[64];
-  char *tmp;
-  FILE *fd;
 
   strcpy(path+pathlen, dmi_name);
-  fd = hwloc_fopen(path, "r", data->root_fd);
-  if (!fd)
+  if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
     return;
 
-  dmi_line[0] = '\0';
-  tmp = fgets(dmi_line, sizeof(dmi_line), fd);
-  fclose (fd);
-
-  if (tmp && dmi_line[0] != '\0') {
-    tmp = strchr(dmi_line, '\n');
+  if (dmi_line[0] != '\0') {
+    char *tmp = strchr(dmi_line, '\n');
     if (tmp)
       *tmp = '\0';
     hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
@@ -2244,182 +2629,6 @@ hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
 }
 
-struct hwloc_firmware_dmi_mem_device_header {
-  unsigned char type;
-  unsigned char length;
-  unsigned char handle[2];
-  unsigned char phy_mem_handle[2];
-  unsigned char mem_err_handle[2];
-  unsigned char tot_width[2];
-  unsigned char dat_width[2];
-  unsigned char size[2];
-  unsigned char ff;
-  unsigned char dev_set;
-  unsigned char dev_loc_str_num;
-  unsigned char bank_loc_str_num;
-  unsigned char mem_type;
-  unsigned char type_detail[2];
-  unsigned char speed[2];
-  unsigned char manuf_str_num;
-  unsigned char serial_str_num;
-  unsigned char asset_tag_str_num;
-  unsigned char part_num_str_num;
-  /* don't include the following fields since we don't need them,
-   * some old implementations may miss them.
-   */
-};
-
-static int check_dmi_entry(const char *buffer)
-{
-  /* reject empty strings */
-  if (!*buffer)
-    return 0;
-  /* reject strings of spaces (at least Dell use this for empty memory slots) */
-  if (strspn(buffer, " ") == strlen(buffer))
-    return 0;
-  return 1;
-}
-
-static void
-hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
-					unsigned idx, const char *path, FILE *fd,
-					struct hwloc_firmware_dmi_mem_device_header *header)
-{
-  unsigned slen;
-  char buffer[256]; /* enough for memory device strings, or at least for each of them */
-  unsigned foff; /* offset in raw file */
-  unsigned boff; /* offset in buffer read from raw file */
-  unsigned i;
-  struct hwloc_obj_info_s *infos = NULL;
-  unsigned infos_count = 0;
-  hwloc_obj_t misc;
-  int foundinfo = 0;
-
-  hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
-
-  /* start after the header */
-  foff = header->length;
-  i = 1;
-  while (1) {
-    /* read one buffer */
-    if (fseek(fd, foff, SEEK_SET) < 0)
-      break;
-    if (!fgets(buffer, sizeof(buffer), fd))
-      break;
-    /* read string at the beginning of the buffer */
-    boff = 0;
-    while (1) {
-      /* stop on empty string */
-      if (!buffer[boff])
-        goto done;
-      /* stop if this string goes to the end of the buffer */
-      slen = strlen(buffer+boff);
-      if (boff + slen+1 == sizeof(buffer))
-        break;
-      /* string didn't get truncated, should be OK */
-      if (i == header->manuf_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
-	  foundinfo = 1;
-	}
-      }	else if (i == header->serial_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
-	  foundinfo = 1;
-	}
-      } else if (i == header->asset_tag_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
-	  foundinfo = 1;
-	}
-      } else if (i == header->part_num_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
-	  foundinfo = 1;
-	}
-      } else if (i == header->dev_loc_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
-	  /* only a location, not an actual info about the device */
-	}
-      } else if (i == header->bank_loc_str_num) {
-	if (check_dmi_entry(buffer+boff)) {
-	  hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
-	  /* only a location, not an actual info about the device */
-	}
-      } else {
-	goto done;
-      }
-      /* next string in buffer */
-      boff += slen+1;
-      i++;
-    }
-    /* couldn't read a single full string from that buffer, we're screwed */
-    if (!boff) {
-      fprintf(stderr, "hwloc could read a DMI firmware entry #%u in %s\n",
-	      i, path);
-      break;
-    }
-    /* reread buffer after previous string */
-    foff += boff;
-  }
-
-done:
-  if (!foundinfo) {
-    /* found no actual info about the device. if there's only location info, the slot may be empty */
-    goto out_with_infos;
-  }
-
-  misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
-  if (!misc)
-    goto out_with_infos;
-
-  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
-  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
-   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
-   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
-   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
-   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
-   */
-  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
-  return;
-
- out_with_infos:
-  hwloc__free_infos(infos, infos_count);
-}
-
-static void
-hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
-				    struct hwloc_linux_backend_data_s *data)
-{
-  char path[128];
-  unsigned i;
-
-  for(i=0; ; i++) {
-    FILE *fd;
-    struct hwloc_firmware_dmi_mem_device_header header;
-    int err;
-
-    snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
-    fd = hwloc_fopen(path, "r", data->root_fd);
-    if (!fd)
-      break;
-
-    err = fread(&header, sizeof(header), 1, fd);
-    if (err != 1)
-      break;
-    if (header.length < sizeof(header)) {
-      /* invalid, or too old entry/spec that doesn't contain what we need */
-      fclose(fd);
-      break;
-    }
-
-    hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
-
-    fclose(fd);
-  }
-}
-
 
 /***********************************
  ****** Device tree Discovery ******
@@ -2471,7 +2680,12 @@ hwloc_read_str(const char *p, const char *p1, int root_fd)
   size_t cb = 0;
   char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
   if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
-    ret = realloc(ret, cb + 1);
+    char *tmp = realloc(ret, cb + 1);
+    if (!tmp) {
+      free(ret);
+      return NULL;
+    }
+    ret = tmp;
     ret[cb] = 0;
   }
   return ret;
@@ -2508,11 +2722,17 @@ add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
     uint32_t l2_cache, uint32_t phandle, const char *name)
 {
   if (cpus->n == cpus->allocated) {
+    void *tmp;
+    unsigned allocated;
     if (!cpus->allocated)
-      cpus->allocated = 64;
+      allocated = 64;
     else
-      cpus->allocated *= 2;
-    cpus->p = realloc(cpus->p, cpus->allocated * sizeof(cpus->p[0]));
+      allocated = 2 * cpus->allocated;
+    tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
+    if (!tmp)
+      return; /* failed to realloc, ignore this entry */
+    cpus->p = tmp;
+    cpus->allocated = allocated;
   }
   cpus->p[cpus->n].phandle = phandle;
   cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
@@ -2548,20 +2768,27 @@ look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
 
 static void
 try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
-				    unsigned int level, hwloc_obj_cache_type_t type,
+				    unsigned int level, hwloc_obj_cache_type_t ctype,
 				    uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
 				    hwloc_bitmap_t cpuset)
 {
   struct hwloc_obj *c = NULL;
+  hwloc_obj_type_t otype;
 
   if (0 == cache_size)
     return;
 
-  c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+  otype = hwloc_cache_type_by_depth_type(level, ctype);
+  if (otype == HWLOC_OBJ_TYPE_NONE)
+    return;
+  if (!hwloc_filter_check_keep_object_type(topology, otype))
+    return;
+
+  c = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
   c->attr->cache.depth = level;
   c->attr->cache.linesize = cache_line_size;
   c->attr->cache.size = cache_size;
-  c->attr->cache.type = type;
+  c->attr->cache.type = ctype;
   if (cache_sets == 1)
     /* likely wrong, make it unknown */
     cache_sets = 0;
@@ -2570,8 +2797,8 @@ try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
   else
     c->attr->cache.associativity = 0;
   c->cpuset = hwloc_bitmap_dup(cpuset);
-  hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
-			   type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
+  hwloc_debug_2args_bitmap("cache (%s) depth %u has cpuset %s\n",
+			   ctype == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (ctype == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
 			   level, c->cpuset);
   hwloc_insert_object_by_cpuset(topology, c);
 }
@@ -2638,8 +2865,10 @@ look_powerpc_device_tree(struct hwloc_topology *topology,
     return;
 
   /* only works for Power so far, and not useful on ARM */
-  if (strncmp(data->utsname.machine, "ppc", 3))
+  if (data->arch != HWLOC_LINUX_ARCH_POWER) {
+    closedir(dt);
     return;
+  }
 
   cpus.n = 0;
   cpus.p = NULL;
@@ -2649,11 +2878,14 @@ look_powerpc_device_tree(struct hwloc_topology *topology,
     char cpu[256];
     char *device_type;
     uint32_t reg = -1, l2_cache = -1, phandle = -1;
+    int err;
 
     if ('.' == dirent->d_name[0])
       continue;
 
-    snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
+    err = snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
+    if ((size_t) err >= sizeof(cpu))
+      continue;
 
     device_type = hwloc_read_str(cpu, "device_type", root_fd);
     if (NULL == device_type)
@@ -2698,10 +2930,12 @@ look_powerpc_device_tree(struct hwloc_topology *topology,
         struct hwloc_obj *core = NULL;
         add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
 
-        /* Add core */
-        core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
-        core->cpuset = hwloc_bitmap_dup(cpuset);
-        hwloc_insert_object_by_cpuset(topology, core);
+	if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+	  /* Add core */
+	  core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, (unsigned) reg);
+	  core->cpuset = hwloc_bitmap_dup(cpuset);
+	  hwloc_insert_object_by_cpuset(topology, core);
+	}
 
         /* Add L1 cache */
         try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
@@ -2721,7 +2955,7 @@ look_powerpc_device_tree(struct hwloc_topology *topology,
 
 #ifdef HWLOC_DEBUG
   for (i = 0; i < cpus.n; ++i) {
-    hwloc_debug("%i: %s  ibm,phandle=%08X l2_cache=%08X ",
+    hwloc_debug("%u: %s  ibm,phandle=%08X l2_cache=%08X ",
       i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
     if (NULL == cpus.p[i].cpuset) {
       hwloc_debug("%s\n", "no cpuset");
@@ -2758,1629 +2992,2750 @@ look_powerpc_device_tree(struct hwloc_topology *topology,
   free(cpus.p);
 }
 
+/***************************************
+ * KNL NUMA quirks
+ */
 
+struct knl_hwdata { /* KNL properties parsed from the dumped "knl_memoryside_cache" hwdata file */
+  char memory_mode[32]; /* text following "memory_mode: ", truncated to fit, always NUL-terminated when set */
+  char cluster_mode[32]; /* text following "cluster_mode: ", truncated to fit, always NUL-terminated when set */
+  long long int mcdram_cache_size; /* mcdram_cache_* is valid only if size > 0 */
+  int mcdram_cache_associativity; /* from the "associativity:" line */
+  int mcdram_cache_inclusiveness; /* from the "inclusiveness:" line */
+  int mcdram_cache_line_size; /* from the "line_size:" line */
+};
 
-/**************************************
- ****** Sysfs Topology Discovery ******
- **************************************/
+struct knl_distances_summary { /* histogram of the distinct values of a NUMA distance matrix */
+  unsigned nb_values; /* number of different values found in the matrix */
+  struct knl_distances_value {
+    unsigned occurences; /* times the value was counted: diagonal entries plus upper-triangle entries only */
+    uint64_t value; /* the distance itself */
+  } values[4]; /* sorted by increasing occurences once the matrix has been parsed */
+};
+
+static int hwloc_knl_distances_value_compar(const void *_v1, const void *_v2) /* qsort() callback: ascending occurences */
+{
+  const struct knl_distances_value *v1 = _v1, *v2 = _v2;
+  return (v1->occurences > v2->occurences) - (v1->occurences < v2->occurences); /* -1/0/1 without converting an unsigned difference to int */
+}
 
 static int
-look_sysfsnode(struct hwloc_topology *topology,
-	       struct hwloc_linux_backend_data_s *data,
-	       const char *path, unsigned *found)
+hwloc_linux_knl_parse_numa_distances(unsigned nbnodes, /* validates the matrix and fills summary; 0 if usable, -1 otherwise */
+				     uint64_t *distances,
+				     struct knl_distances_summary *summary)
 {
-  unsigned osnode;
-  unsigned nbnodes = 0;
-  DIR *dir;
-  struct dirent *dirent;
-  hwloc_bitmap_t nodeset;
+  unsigned i, j, k; /* i,j walk the matrix, k indexes summary->values[] */
 
-  *found = 0;
+  summary->nb_values = 1;
+  summary->values[0].value = 10;
+  summary->values[0].occurences = nbnodes; /* the diagonal must hold nbnodes times 10 (checked below) */
 
-  /* Get the list of nodes first */
-  dir = hwloc_opendir(path, data->root_fd);
-  if (dir)
-    {
-      nodeset = hwloc_bitmap_alloc();
-      while ((dirent = readdir(dir)) != NULL)
-	{
-	  if (strncmp(dirent->d_name, "node", 4))
-	    continue;
-	  osnode = strtoul(dirent->d_name+4, NULL, 0);
-	  hwloc_bitmap_set(nodeset, osnode);
-	  nbnodes++;
-	}
-      closedir(dir);
-    }
-  else
+  if (nbnodes == 1)
+    /* nothing else needed */
+    return 0;
+
+  if (nbnodes != 2 && nbnodes != 4 && nbnodes != 8) {
+    fprintf(stderr, "Ignoring KNL NUMA quirk, nbnodes (%u) isn't 2, 4 or 8.\n", nbnodes);
     return -1;
+  }
 
-  if (nbnodes <= 1)
-    {
-      hwloc_bitmap_free(nodeset);
-      return 0;
-    }
-
-  /* For convenience, put these declarations inside a block. */
+  if (!distances) {
+    fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix missing.\n");
+    return -1;
+  }
 
-  {
-      hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
-      unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
-      float * distances;
-      int failednodes = 0;
-      unsigned index_;
-
-      if (NULL == nodes || NULL == indexes) {
-          free(nodes);
-          free(indexes);
-          hwloc_bitmap_free(nodeset);
-          nbnodes = 0;
-          goto out;
+  for(i=0; i<nbnodes; i++) {
+    /* check we have 10 on the diagonal */
+    if (distances[i*nbnodes+i] != 10) {
+      fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix does not contain 10 on the diagonal.\n");
+      return -1;
+    }
+    for(j=i+1; j<nbnodes; j++) { /* upper triangle only: each symmetric pair is counted once */
+      uint64_t distance = distances[i*nbnodes+j];
+      /* check things are symmetric */
+      if (distance != distances[i+j*nbnodes]) {
+	fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix isn't symmetric.\n");
+	return -1;
+      }
+      /* check everything outside the diagonal is > 10 */
+      if (distance <= 10) {
+	fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix contains values <= 10.\n");
+	return -1;
+      }
+      /* did we already see this value? */
+      for(k=0; k<summary->nb_values; k++)
+	if (distance == summary->values[k].value) {
+	  summary->values[k].occurences++;
+	  break;
+	}
+      if (k == summary->nb_values) {
+	/* add a new value */
+	if (k == 4) {
+	  fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix contains more than 4 different values.\n");
+	  return -1;
+	}
+	summary->values[k].value = distance;
+	summary->values[k].occurences = 1;
+	summary->nb_values++;
       }
+    }
+  }
 
-      /* Unsparsify node indexes.
-       * We'll need them later because Linux groups sparse distances
-       * and keeps them in order in the sysfs distance files.
-       * It'll simplify things in the meantime.
-       */
-      index_ = 0;
-      hwloc_bitmap_foreach_begin (osnode, nodeset) {
-	indexes[index_] = osnode;
-	index_++;
-      } hwloc_bitmap_foreach_end();
-      hwloc_bitmap_free(nodeset);
+  qsort(summary->values, summary->nb_values, sizeof(struct knl_distances_value), hwloc_knl_distances_value_compar);
 
-#ifdef HWLOC_DEBUG
-      hwloc_debug("%s", "NUMA indexes: ");
-      for (index_ = 0; index_ < nbnodes; index_++) {
-	hwloc_debug(" %u", indexes[index_]);
-      }
-      hwloc_debug("%s", "\n");
-#endif
+  if (nbnodes == 2) {
+    if (summary->nb_values != 2) {
+      fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix for 2 nodes cannot contain %u different values instead of 2.\n",
+	      summary->nb_values);
+      return -1;
+    }
 
-      /* Create NUMA objects */
-      for (index_ = 0; index_ < nbnodes; index_++) {
-          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
-          hwloc_bitmap_t cpuset;
-          hwloc_obj_t node, res_obj;
+  } else if (nbnodes == 4) {
+    if (summary->nb_values != 2 && summary->nb_values != 4) {
+      fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix for 4 nodes cannot contain %u different values instead of 2 or 4.\n",
+	      summary->nb_values);
+      return -1;
+    }
 
-	  osnode = indexes[index_];
+  } else if (nbnodes == 8) {
+    if (summary->nb_values != 4) {
+      fprintf(stderr, "Ignoring KNL NUMA quirk, distance matrix for 8 nodes cannot contain %u different values instead of 4.\n",
+	      summary->nb_values);
+      return -1;
+    }
 
-          sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
-          cpuset = hwloc_parse_cpumap(nodepath, data->root_fd);
-          if (!cpuset) {
-	    /* This NUMA object won't be inserted, we'll ignore distances */
-	    failednodes++;
-	    continue;
-	  }
+  } else {
+    abort(); /* checked above */
+  }
 
-          node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
-          node->cpuset = cpuset;
-          node->nodeset = hwloc_bitmap_alloc();
-          hwloc_bitmap_set(node->nodeset, osnode);
-
-          hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
-
-          hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
-                                  osnode, node->cpuset);
-          res_obj = hwloc_insert_object_by_cpuset(topology, node);
-	  if (node == res_obj) {
-	    nodes[index_] = node;
-	  } else {
-	    /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
-	     * This object disappeared, we'll ignore distances */
-	    failednodes++;
-	  }
-      }
+  hwloc_debug("Summary of KNL distance matrix:\n");
+  for(k=0; k<summary->nb_values; k++)
+    hwloc_debug("  Found %u times distance %llu\n", summary->values[k].occurences, (unsigned long long) summary->values[k].value);
+  return 0;
+}
 
-      if (failednodes) {
-	/* failed to read/create some nodes, don't bother reading/fixing
-	 * a distance matrix that would likely be wrong anyway.
-	 */
-	nbnodes -= failednodes;
-	distances = NULL;
-      } else {
-	distances = calloc(nbnodes*nbnodes, sizeof(float));
-      }
+static int /* 0 when both DDR and both MCDRAM nodes were identified, -1 otherwise */
+hwloc_linux_knl_identify_4nodes(uint64_t *distances,
+				struct knl_distances_summary *distsum,
+				unsigned *ddr, unsigned *mcdram) /* ddr and mcdram arrays must be 2-long */
+{
+  uint64_t value;
+  unsigned i;
 
-      if (NULL == distances) {
-          free(nodes);
-          free(indexes);
-          goto out;
-      }
+  hwloc_debug("Trying to identify 4 KNL NUMA nodes in SNC-2 cluster mode...\n");
+
+  /* The SNC2-Flat/Hybrid matrix should be something like
+   * 10 21 31 41
+   * 21 10 41 31
+   * 31 41 10 41
+   * 41 31 41 10
+   * which means there are (above 4*10 on the diagonal):
+   * 1 unique value for DDR to other DDR,
+   * 2 identical values for DDR to local MCDRAM
+   * 3 identical values for everything else.
+   */
+  if (distsum->nb_values != 4
+      || distsum->values[0].occurences != 1 /* DDR to DDR */
+      || distsum->values[1].occurences != 2 /* DDR to local MCDRAM */
+      || distsum->values[2].occurences != 3 /* others */
+      || distsum->values[3].occurences != 4 /* local */ )
+    return -1; /* the summary does not match the pattern described above */
+
+  /* DDR:0 is always first */
+  ddr[0] = 0;
+
+  /* DDR:1 is at distance distsum->values[0].value from ddr[0] */
+  value = distsum->values[0].value;
+  ddr[1] = 0; /* 0 doubles as "not found" since node 0 is already DDR#0 */
+  hwloc_debug("  DDR#0 is NUMAnode#0\n");
+  for(i=0; i<4; i++) /* scan row 0 of the 4x4 matrix */
+    if (distances[i] == value) {
+      ddr[1] = i;
+      hwloc_debug("  DDR#1 is NUMAnode#%u\n", i);
+      break;
+    }
+  if (!ddr[1])
+    return -1;
 
-      /* Get actual distances now */
-      for (index_ = 0; index_ < nbnodes; index_++) {
-          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+  /* MCDRAMs are at distance distsum->values[1].value from their local DDR */
+  value = distsum->values[1].value;
+  mcdram[0] = mcdram[1] = 0; /* 0 again means "not found yet" */
+  for(i=1; i<4; i++) {
+    if (distances[i] == value) { /* row of DDR#0 */
+      hwloc_debug("  MCDRAM#0 is NUMAnode#%u\n", i);
+      mcdram[0] = i;
+    } else if (distances[ddr[1]*4+i] == value) { /* row of DDR#1 */
+      hwloc_debug("  MCDRAM#1 is NUMAnode#%u\n", i);
+      mcdram[1] = i;
+    }
+  }
+  if (!mcdram[0] || !mcdram[1])
+    return -1;
 
-	  osnode = indexes[index_];
+  return 0; /* ddr[] and mcdram[] now hold indexes into the distance matrix */
+}
 
-	  /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
-	   * store them in slots X*N...X*N+N-1 */
-          sprintf(nodepath, "%s/node%u/distance", path, osnode);
-          hwloc_parse_node_distance(nodepath, nbnodes, distances+index_*nbnodes, data->root_fd);
-      }
+static int /* 0 when the four DDR and four MCDRAM nodes were identified, -1 otherwise */
+hwloc_linux_knl_identify_8nodes(uint64_t *distances,
+				struct knl_distances_summary *distsum,
+				unsigned *ddr, unsigned *mcdram) /* ddr and mcdram arrays must be 4-long */
+{
+  uint64_t value;
+  unsigned i, nb;
+
+  hwloc_debug("Trying to identify 8 KNL NUMA nodes in SNC-4 cluster mode...\n");
+
+  /* The SNC4-Flat/Hybrid matrix should be something like
+   * 10 21 21 21 31 41 41 41
+   * 21 10 21 21 41 31 41 41
+   * 21 21 10 21 41 41 31 41
+   * 21 21 21 10 41 41 41 31
+   * 31 41 41 41 10 41 41 41
+   * 41 31 41 41 41 10 41 41
+   * 41 41 31 41 41 41 10 41
+   * 41 41 41 31 41 41 41 10
+   * which means there are (above 8*10 on the diagonal):
+   * 4 identical values for DDR to local MCDRAM
+   * 6 identical values for DDR to other DDR,
+   * 18 identical values for everything else.
+   */
+  if (distsum->nb_values != 4
+      || distsum->values[0].occurences != 4 /* DDR to local MCDRAM */
+      || distsum->values[1].occurences != 6 /* DDR to DDR */
+      || distsum->values[2].occurences != 8 /* local */
+      || distsum->values[3].occurences != 18 /* others */ )
+    return -1; /* the summary does not match the pattern described above */
+
+  /* DDR:0 is always first */
+  ddr[0] = 0;
+  hwloc_debug("  DDR#0 is NUMAnode#0\n");
+
+  /* DDR:[1-3] are at distance distsum->values[1].value from ddr[0] */
+  value = distsum->values[1].value;
+  ddr[1] = ddr[2] = ddr[3] = 0; /* 0 doubles as "not found" since node 0 is already DDR#0 */
+  nb = 1; /* next ddr[] slot to fill */
+  for(i=0; i<8; i++) /* scan row 0 of the 8x8 matrix */
+    if (distances[i] == value) {
+      hwloc_debug("  DDR#%u is NUMAnode#%u\n", nb, i);
+      ddr[nb++] = i;
+      if (nb == 4)
+	break;
+    }
+  if (nb != 4 || !ddr[1] || !ddr[2] || !ddr[3])
+    return -1;
 
-      hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+  /* MCDRAMs are at distance distsum->values[0].value from their local DDR */
+  value = distsum->values[0].value;
+  mcdram[0] = mcdram[1] = mcdram[2] = mcdram[3] = 0; /* 0 again means "not found yet" */
+  for(i=1; i<8; i++) {
+    if (distances[i] == value) { /* row of DDR#0 */
+      hwloc_debug("  MCDRAM#0 is NUMAnode#%u\n", i);
+      mcdram[0] = i;
+    } else if (distances[ddr[1]*8+i] == value) { /* row of DDR#1 */
+      hwloc_debug("  MCDRAM#1 is NUMAnode#%u\n", i);
+      mcdram[1] = i;
+    } else if (distances[ddr[2]*8+i] == value) { /* row of DDR#2 */
+      hwloc_debug("  MCDRAM#2 is NUMAnode#%u\n", i);
+      mcdram[2] = i;
+    } else if (distances[ddr[3]*8+i] == value) { /* row of DDR#3 */
+      hwloc_debug("  MCDRAM#3 is NUMAnode#%u\n", i);
+      mcdram[3] = i;
+    }
   }
+  if (!mcdram[0] || !mcdram[1] || !mcdram[2] || !mcdram[3])
+    return -1;
 
- out:
-  *found = nbnodes;
   return 0;
 }
 
-/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
+/* Try to handle knl hwdata properties: parse <dumped_hwdata_dirname>/knl_memoryside_cache into hwdata.
+ * Returns 0 on success (even if the cache fields end up marked invalid) and -1 if the file is unreadable or lacks a version header */
 static int
-look_sysfscpu(struct hwloc_topology *topology,
-	      struct hwloc_linux_backend_data_s *data,
-	      const char *path,
-	      struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
+hwloc_linux_knl_read_hwdata_properties(struct hwloc_linux_backend_data_s *data,
+				       struct knl_hwdata *hwdata)
 {
-  hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
-  hwloc_bitmap_t unknownset; /* Set of cpus to clear */
-#define CPU_TOPOLOGY_STR_LEN 128
-  char str[CPU_TOPOLOGY_STR_LEN];
-  DIR *dir;
-  int i,j;
-  FILE *fd;
-  unsigned caches_added, merge_buggy_core_siblings;
-  hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
+  char *knl_cache_file; /* heap-allocated by asprintf(), freed right after the read attempt */
+  int version = 0;
+  char buffer[512] = {0};
+  char *data_beg = NULL; /* start of the line currently being parsed inside buffer */
 
-  /* fill the cpuset of interesting cpus */
-  dir = hwloc_opendir(path, data->root_fd);
-  if (!dir)
+  if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
     return -1;
-  else {
-    struct dirent *dirent;
-    cpuset = hwloc_bitmap_alloc();
-    unknownset = hwloc_bitmap_alloc();
 
-    while ((dirent = readdir(dir)) != NULL) {
-      unsigned long cpu;
-      char online[2];
+  hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
+  if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
+    hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
+    free(knl_cache_file);
+    return -1;
+  }
+  free(knl_cache_file);
 
-      if (strncmp(dirent->d_name, "cpu", 3))
-	continue;
-      cpu = strtoul(dirent->d_name+3, NULL, 0);
+  data_beg = &buffer[0];
 
-      /* Maybe we don't have topology information but at least it exists */
-      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
+  /* file must start with version information */
+  if (sscanf(data_beg, "version: %d", &version) != 1) {
+    fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
+    return -1;
+  }
 
-      /* check whether this processor is online */
-      sprintf(str, "%s/cpu%lu/online", path, cpu);
-      fd = hwloc_fopen(str, "r", data->root_fd);
-      if (fd) {
-	if (fgets(online, sizeof(online), fd)) {
-	  if (!atoi(online)) {
-	    fclose(fd);
-	    hwloc_debug("os proc %lu is offline\n", cpu);
-	    hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
-	    hwloc_bitmap_set(unknownset, cpu);
-	    continue;
-	  }
-	}
-	fclose(fd);
+  while (1) { /* one iteration per newline-terminated line; an unterminated last line is ignored */
+    char *line_end = strstr(data_beg, "\n");
+    if (!line_end)
+        break;
+    if (version >= 1) {
+      if (!strncmp("cache_size:", data_beg, strlen("cache_size:"))) { /* compare the whole key, colon included, like the other keys */
+          sscanf(data_beg, "cache_size: %lld", &hwdata->mcdram_cache_size);
+          hwloc_debug("read cache_size=%lld\n", hwdata->mcdram_cache_size);
+      } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
+          sscanf(data_beg, "line_size: %d", &hwdata->mcdram_cache_line_size);
+          hwloc_debug("read line_size=%d\n", hwdata->mcdram_cache_line_size);
+      } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
+          sscanf(data_beg, "inclusiveness: %d", &hwdata->mcdram_cache_inclusiveness);
+          hwloc_debug("read inclusiveness=%d\n", hwdata->mcdram_cache_inclusiveness);
+      } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
+          sscanf(data_beg, "associativity: %d\n", &hwdata->mcdram_cache_associativity);
+          hwloc_debug("read associativity=%d\n", hwdata->mcdram_cache_associativity);
       }
-
-      /* check whether the kernel exports topology information for this cpu */
-      sprintf(str, "%s/cpu%lu/topology", path, cpu);
-      if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
-	hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
-		   cpu, path, cpu);
-	hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
-	hwloc_bitmap_set(unknownset, cpu);
-	continue;
+    }
+    if (version >= 2) {
+      if (!strncmp("cluster_mode: ", data_beg, strlen("cluster_mode: "))) {
+	size_t length;
+	data_beg += strlen("cluster_mode: ");
+	length = line_end-data_beg;
+	if (length > sizeof(hwdata->cluster_mode)-1) /* truncate so that the NUL terminator always fits */
+	  length = sizeof(hwdata->cluster_mode)-1;
+	memcpy(hwdata->cluster_mode, data_beg, length);
+	hwdata->cluster_mode[length] = '\0';
+        hwloc_debug("read cluster_mode=%s\n", hwdata->cluster_mode);
+      } else if (!strncmp("memory_mode: ", data_beg, strlen("memory_mode: "))) {
+	size_t length;
+	data_beg += strlen("memory_mode: ");
+	length = line_end-data_beg;
+	if (length > sizeof(hwdata->memory_mode)-1) /* truncate so that the NUL terminator always fits */
+	  length = sizeof(hwdata->memory_mode)-1;
+	memcpy(hwdata->memory_mode, data_beg, length);
+	hwdata->memory_mode[length] = '\0';
+        hwloc_debug("read memory_mode=%s\n", hwdata->memory_mode);
       }
-
-      hwloc_bitmap_set(cpuset, cpu);
     }
-    closedir(dir);
+
+    data_beg = line_end + 1;
   }
 
-  topology->support.discovery->pu = 1;
-  hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
-	     hwloc_bitmap_weight(cpuset), cpuset);
+  if (hwdata->mcdram_cache_size == -1 /* NOTE(review): presumably the caller pre-sets these four fields to -1 - confirm */
+      || hwdata->mcdram_cache_line_size == -1
+      || hwdata->mcdram_cache_associativity == -1
+      || hwdata->mcdram_cache_inclusiveness == -1) {
+    hwloc_debug("Incorrect file format cache_size=%lld line_size=%d associativity=%d inclusiveness=%d\n",
+		hwdata->mcdram_cache_size,
+		hwdata->mcdram_cache_line_size,
+		hwdata->mcdram_cache_associativity,
+		hwdata->mcdram_cache_inclusiveness);
+    hwdata->mcdram_cache_size = -1; /* mark cache as invalid */
+  }
 
-  merge_buggy_core_siblings = (!strcmp(data->utsname.machine, "x86_64"))
-			   || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"));
-  caches_added = 0;
-  hwloc_bitmap_foreach_begin(i, cpuset)
-    {
-      hwloc_bitmap_t packageset, coreset, bookset, threadset, savedcoreset;
-      unsigned mypackageid, mycoreid, mybookid;
-      //int threadwithcoreid = 0;
-      int threadwithcoreid = data->is_amd_with_CU ? -1 : 0;
+  return 0;
+}
 
-      /* look at the package */
-      mypackageid = 0; /* shut-up the compiler */
-      sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i);
-      hwloc_parse_sysfs_unsigned(str, &mypackageid, data->root_fd);
+static void
+hwloc_linux_knl_guess_hwdata_properties(struct knl_hwdata *hwdata,
+					hwloc_obj_t *nodes, unsigned nbnodes,
+					struct knl_distances_summary *distsum)
+{
+  /* Try to guess KNL configuration (Cluster mode, Memory mode, and MCDRAM cache info)
+   * from the NUMA configuration (number of nodes, CPUless or not, distances).
+   * Keep in mind that some CPUs might be offline (hence DDR could be CPUless too).
+   * Keep in mind that part of the memory might be offline (hence MCDRAM could contain less than 16GB total).
+   */
 
-      sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
-      packageset = hwloc_parse_cpumap(str, data->root_fd);
-      if (packageset) {
-       hwloc_bitmap_andnot(packageset, packageset, unknownset);
-       if (hwloc_bitmap_first(packageset) == i) {
-        /* first cpu in this package, add the package */
-	struct hwloc_obj *package;
-
-	if (merge_buggy_core_siblings) {
-	  /* check for another package with same physical_package_id */
-	  hwloc_obj_t curpackage = packages;
-	  while (curpackage) {
-	    if (curpackage->os_index == mypackageid) {
-	      /* found another package with same physical_package_id but different core_siblings.
-	       * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
-	       * merge these core_siblings to extend the existing first package object.
-	       */
-	      static int reported = 0;
-	      if (!reported && !hwloc_hide_errors()) {
-		char *a, *b;
-		hwloc_bitmap_asprintf(&a, curpackage->cpuset);
-		hwloc_bitmap_asprintf(&b, packageset);
-		fprintf(stderr, "****************************************************************************\n");
-		fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
-		fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
-			mypackageid, a, b);
-		fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
-		fprintf(stderr, "* does not support this processor correctly.\n");
-		fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
-	        fprintf(stderr, "*\n");
-		fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
-		fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
-		fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
-		fprintf(stderr, "****************************************************************************\n");
-		reported = 1;
-		free(a);
-		free(b);
-	      }
-	      hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
-	      goto package_done;
-	    }
-	    curpackage = curpackage->next_cousin;
-	  }
-	}
+  hwloc_debug("Trying to guess missing KNL configuration information...\n");
 
-	/* no package with same physical_package_id, create a new one */
-	package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
-	package->cpuset = packageset;
-	hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
-				mypackageid, packageset);
-	/* add cpuinfo */
-	if (cpuinfo_Lprocs) {
-	  for(j=0; j<(int) cpuinfo_numprocs; j++)
-	    if ((int) cpuinfo_Lprocs[j].Pproc == i) {
-	      hwloc__move_infos(&package->infos, &package->infos_count,
-				&cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
-	    }
-	}
-	/* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
-	 * we'll actually insert the tree at the end of the entire sysfs cpu loop.
-	 */
-	package->next_cousin = packages;
-	packages = package;
+  /* These MCDRAM cache attributes are always valid.
+   * We'll only use them if mcdram_cache_size > 0
+   */
+  hwdata->mcdram_cache_associativity = 1;
+  hwdata->mcdram_cache_inclusiveness = 1;
+  hwdata->mcdram_cache_line_size = 64;
+  /* All commercial KNL/KNM have 16GB of MCDRAM, we'll divide that by the number of SNC clusters */
+
+  if (hwdata->mcdram_cache_size > 0
+      && hwdata->cluster_mode[0]
+      && hwdata->memory_mode[0])
+    /* Nothing to guess */
+    return;
 
-	packageset = NULL; /* don't free it */
-       }
-      }
-package_done:
-      hwloc_bitmap_free(packageset);
+  /* Quadrant/All2All/Hemisphere are basically identical from the application point-of-view,
+   * and Quadrant is recommended (except if underpopulating DIMMs).
+   * Hence we'll assume Quadrant when unknown.
+   */
 
-      /* look at the core */
-      mycoreid = 0; /* shut-up the compiler */
-      sprintf(str, "%s/cpu%d/topology/core_id", path, i);
-      hwloc_parse_sysfs_unsigned(str, &mycoreid, data->root_fd);
+  /* Flat/Hybrid25/Hybrid50 cannot be distinguished unless we know the Cache size
+   * (if running an old hwloc-dump-hwdata that reports Cache size without modes)
+   * or we're sure MCDRAM NUMAnode size was not decreased by offlining some memory.
+   * Hence we'll assume Flat when unknown.
+   */
 
-      sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
-      coreset = hwloc_parse_cpumap(str, data->root_fd);
-      savedcoreset = coreset; /* store it for later work-arounds */
-      if (coreset) {
-       hwloc_bitmap_andnot(coreset, coreset, unknownset);
-       if (hwloc_bitmap_weight(coreset) > 1) {
-	/* check if this is hyper-threading or different coreids */
-	unsigned siblingid, siblingcoreid;
-	hwloc_bitmap_t set = hwloc_bitmap_dup(coreset);
-	hwloc_bitmap_clr(set, i);
-	siblingid = hwloc_bitmap_first(set);
-	siblingcoreid = mycoreid;
-	sprintf(str, "%s/cpu%d/topology/core_id", path, siblingid);
-	hwloc_parse_sysfs_unsigned(str, &siblingcoreid, data->root_fd);
-	//threadwithcoreid = (siblingcoreid != mycoreid);
-	hwloc_bitmap_free(set);
-       }
-       if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
-	/* regular core */
-        struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
-	if (threadwithcoreid) {
-	  /* amd multicore compute-unit, create one core per thread */
-	  core->cpuset = hwloc_bitmap_alloc();
-	  hwloc_bitmap_set(core->cpuset, i);
-	} else {
-	  core->cpuset = coreset;
+  if (nbnodes == 1) {
+    /* Quadrant-Cache */
+    if (!hwdata->cluster_mode[0])
+      strcpy(hwdata->cluster_mode, "Quadrant");
+    if (!hwdata->memory_mode[0])
+      strcpy(hwdata->memory_mode, "Cache");
+    if (hwdata->mcdram_cache_size <= 0)
+      hwdata->mcdram_cache_size = 16UL*1024*1024*1024;
+
+  } else if (nbnodes == 2) {
+    /* most likely Quadrant-Flat/Hybrid,
+     * or SNC2/Cache (unlikely)
+     */
+
+    if (!strcmp(hwdata->memory_mode, "Cache")
+	|| !strcmp(hwdata->cluster_mode, "SNC2")
+	|| !hwloc_bitmap_iszero(nodes[1]->cpuset)) { /* MCDRAM cannot be nodes[0], and its cpuset is always empty */
+      /* SNC2-Cache */
+      if (!hwdata->cluster_mode[0])
+	strcpy(hwdata->cluster_mode, "SNC2");
+      if (!hwdata->memory_mode[0])
+	strcpy(hwdata->memory_mode, "Cache");
+      if (hwdata->mcdram_cache_size <= 0)
+	hwdata->mcdram_cache_size = 8UL*1024*1024*1024;
+
+    } else {
+      /* Assume Quadrant-Flat/Hybrid.
+       * Could also be SNC2-Cache with offline CPUs in nodes[1] (unlikely).
+       */
+      if (!hwdata->cluster_mode[0])
+	strcpy(hwdata->cluster_mode, "Quadrant");
+      if (!hwdata->memory_mode[0]) {
+	if (hwdata->mcdram_cache_size == 4UL*1024*1024*1024)
+	  strcpy(hwdata->memory_mode, "Hybrid25");
+	else if (hwdata->mcdram_cache_size == 8UL*1024*1024*1024)
+	  strcpy(hwdata->memory_mode, "Hybrid50");
+	else
+	  strcpy(hwdata->memory_mode, "Flat");
+      } else {
+	if (hwdata->mcdram_cache_size <= 0) {
+	  if (!strcmp(hwdata->memory_mode, "Hybrid25"))
+	    hwdata->mcdram_cache_size = 4UL*1024*1024*1024;
+	  else if (!strcmp(hwdata->memory_mode, "Hybrid50"))
+	    hwdata->mcdram_cache_size = 8UL*1024*1024*1024;
 	}
-        hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
-                     mycoreid, coreset);
-        hwloc_insert_object_by_cpuset(topology, core);
-        coreset = NULL; /* don't free it */
-       }
       }
+    }
 
-      /* look at the books */
-      mybookid = 0; /* shut-up the compiler */
-      sprintf(str, "%s/cpu%d/topology/book_id", path, i);
-      if (hwloc_parse_sysfs_unsigned(str, &mybookid, data->root_fd) == 0) {
-        sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
-        bookset = hwloc_parse_cpumap(str, data->root_fd);
-	if (bookset) {
-	 hwloc_bitmap_andnot(bookset, bookset, unknownset);
-         if (bookset && hwloc_bitmap_first(bookset) == i) {
-          struct hwloc_obj *book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
-          book->cpuset = bookset;
-          hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
-                       mybookid, bookset);
-          hwloc_obj_add_info(book, "Type", "Book");
-          hwloc_insert_object_by_cpuset(topology, book);
-          bookset = NULL; /* don't free it */
-	 }
-        }
-      }
+  } else if (nbnodes == 4) {
+    /* most likely SNC4-Cache
+     * or SNC2-Flat/Hybrid (unlikely)
+     *
+     * SNC2-Flat/Hybrid has 4 different values in distsum,
+     * while SNC4-Cache only has 2.
+     */
 
-      {
-      /* look at the thread */
-      struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
-      threadset = hwloc_bitmap_alloc();
-      hwloc_bitmap_only(threadset, i);
-      thread->cpuset = threadset;
-      hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
-		 i, threadset);
-      hwloc_insert_object_by_cpuset(topology, thread);
+    if (!strcmp(hwdata->cluster_mode, "SNC2") || distsum->nb_values == 4) {
+      /* SNC2-Flat/Hybrid */
+      if (!hwdata->cluster_mode[0])
+	strcpy(hwdata->cluster_mode, "SNC2");
+      if (!hwdata->memory_mode[0]) {
+	if (hwdata->mcdram_cache_size == 2UL*1024*1024*1024)
+	  strcpy(hwdata->memory_mode, "Hybrid25");
+	else if (hwdata->mcdram_cache_size == 4UL*1024*1024*1024)
+	  strcpy(hwdata->memory_mode, "Hybrid50");
+	else
+	  strcpy(hwdata->memory_mode, "Flat");
+      } else {
+	if (hwdata->mcdram_cache_size <= 0) {
+	  if (!strcmp(hwdata->memory_mode, "Hybrid25"))
+	    hwdata->mcdram_cache_size = 2UL*1024*1024*1024;
+	  else if (!strcmp(hwdata->memory_mode, "Hybrid50"))
+	    hwdata->mcdram_cache_size = 4UL*1024*1024*1024;
+	}
       }
 
-      /* look at the caches */
-      for(j=0; j<10; j++) {
-#define SHARED_CPU_MAP_STRLEN 128
-	char mappath[SHARED_CPU_MAP_STRLEN];
-	char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
-	hwloc_bitmap_t cacheset;
-	unsigned long kB = 0;
-	unsigned linesize = 0;
-	unsigned sets = 0, lines_per_tag = 1;
-	int depth; /* 0 for L1, .... */
-	hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
-
-	/* get the cache level depth */
-	sprintf(mappath, "%s/cpu%d/cache/index%d/level", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  char *res = fgets(str2,sizeof(str2), fd);
-	  fclose(fd);
-	  if (res)
-	    depth = strtoul(str2, NULL, 10)-1;
-	  else
-	    continue;
-	} else
-	  continue;
+    } else {
+      /* Assume SNC4-Cache.
+       * SNC2 is unlikely.
+       */
+      if (!hwdata->cluster_mode[0])
+	strcpy(hwdata->cluster_mode, "SNC4");
+      if (!hwdata->memory_mode[0])
+	strcpy(hwdata->memory_mode, "Cache");
+      if (hwdata->mcdram_cache_size <= 0)
+	hwdata->mcdram_cache_size = 4UL*1024*1024*1024;
+    }
 
-	/* cache type */
-	sprintf(mappath, "%s/cpu%d/cache/index%d/type", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  if (fgets(str2, sizeof(str2), fd)) {
-	    fclose(fd);
-	    if (!strncmp(str2, "Data", 4))
-	      type = HWLOC_OBJ_CACHE_DATA;
-	    else if (!strncmp(str2, "Unified", 7))
-	      type = HWLOC_OBJ_CACHE_UNIFIED;
-	    else if (!strncmp(str2, "Instruction", 11))
-	      type = HWLOC_OBJ_CACHE_INSTRUCTION;
-	    else
-	      continue;
-	  } else {
-	    fclose(fd);
-	    continue;
-	  }
-	} else
-	  continue;
+  } else if (nbnodes == 8) {
+    /* SNC4-Flat/Hybrid */
 
-	/* get the cache size */
-	sprintf(mappath, "%s/cpu%d/cache/index%d/size", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  if (fgets(str2,sizeof(str2), fd))
-	    kB = atol(str2); /* in kB */
-	  fclose(fd);
-	}
+    if (!hwdata->cluster_mode[0])
+      strcpy(hwdata->cluster_mode, "SNC4");
+    if (!hwdata->memory_mode[0]) {
+      if (hwdata->mcdram_cache_size == 1UL*1024*1024*1024)
+	strcpy(hwdata->memory_mode, "Hybrid25");
+      else if (hwdata->mcdram_cache_size == 2UL*1024*1024*1024)
+	strcpy(hwdata->memory_mode, "Hybrid50");
+      else
+	strcpy(hwdata->memory_mode, "Flat");
+    } else {
+      if (hwdata->mcdram_cache_size <= 0) {
+	if (!strcmp(hwdata->memory_mode, "Hybrid25"))
+	  hwdata->mcdram_cache_size = 1UL*1024*1024*1024;
+	else if (!strcmp(hwdata->memory_mode, "Hybrid50"))
+	  hwdata->mcdram_cache_size = 2UL*1024*1024*1024;
+      }
+    }
+  }
 
-	/* get the line size */
-	sprintf(mappath, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  if (fgets(str2,sizeof(str2), fd))
-	    linesize = atol(str2); /* in bytes */
-	  fclose(fd);
-	}
+  hwloc_debug("  Found cluster=%s memory=%s cache=%lld\n",
+	      hwdata->cluster_mode, hwdata->memory_mode,
+	      hwdata->mcdram_cache_size);
+}
 
-	/* get the number of sets and lines per tag.
-	 * don't take the associativity directly in "ways_of_associativity" because
-	 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
-	 */
-	sprintf(mappath, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  if (fgets(str2,sizeof(str2), fd))
-	    sets = atol(str2);
-	  fclose(fd);
-	}
-	sprintf(mappath, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j);
-	fd = hwloc_fopen(mappath, "r", data->root_fd);
-	if (fd) {
-	  if (fgets(str2,sizeof(str2), fd))
-	    lines_per_tag = atol(str2);
-	  fclose(fd);
-	}
+static void
+hwloc_linux_knl_add_cluster(struct hwloc_topology *topology,
+			    hwloc_obj_t ddr, hwloc_obj_t mcdram,
+			    struct knl_hwdata *knl_hwdata,
+			    int mscache_as_l3,
+			    unsigned *failednodes)
+{
+  hwloc_obj_t cluster = NULL;
 
-	sprintf(mappath, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
-	cacheset = hwloc_parse_cpumap(mappath, data->root_fd);
-        if (cacheset) {
-	  hwloc_bitmap_andnot(cacheset, cacheset, unknownset);
-          if (hwloc_bitmap_weight(cacheset) < 1) {
-            /* mask is wrong (useful for many itaniums) */
-            if (savedcoreset)
-              /* assume it's a core-specific cache */
-              hwloc_bitmap_copy(cacheset, savedcoreset);
-            else
-              /* assumes it's not shared */
-              hwloc_bitmap_only(cacheset, i);
-          }
-
-          if (hwloc_bitmap_first(cacheset) == i) {
-            /* first cpu in this cache, add the cache */
-            struct hwloc_obj *cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
-            cache->attr->cache.size = kB << 10;
-            cache->attr->cache.depth = depth+1;
-            cache->attr->cache.linesize = linesize;
-	    cache->attr->cache.type = type;
-	    if (!linesize || !lines_per_tag || !sets)
-	      cache->attr->cache.associativity = 0; /* unknown */
-	    else if (sets == 1)
-	      cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
-	    else
-	      cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
-            cache->cpuset = cacheset;
-            hwloc_debug_1arg_bitmap("cache depth %d has cpuset %s\n",
-                       depth, cacheset);
-            hwloc_insert_object_by_cpuset(topology, cache);
-            cacheset = NULL; /* don't free it */
-            ++caches_added;
-          }
-        }
-        hwloc_bitmap_free(cacheset);
-      }
-      hwloc_bitmap_free(coreset);
+  if (mcdram) {
+    mcdram->subtype = strdup("MCDRAM");
+    /* Change MCDRAM cpuset to DDR cpuset for clarity.
+     * Not actually useful if we insert with hwloc__attach_memory_object() below.
+     * The cpuset will be updated by the core later anyway.
+     */
+    hwloc_bitmap_copy(mcdram->cpuset, ddr->cpuset);
+
+    /* Add a Group for Cluster containing this MCDRAM + DDR */
+    cluster = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+    hwloc_obj_add_other_obj_sets(cluster, ddr);
+    hwloc_obj_add_other_obj_sets(cluster, mcdram);
+    cluster->subtype = strdup("Cluster");
+    cluster->attr->group.kind = HWLOC_GROUP_KIND_INTEL_KNL_SUBNUMA_CLUSTER;
+    cluster = hwloc__insert_object_by_cpuset(topology, NULL, cluster, hwloc_report_os_error);
+  }
+
+  if (cluster) {
+    /* Now insert NUMA nodes below this cluster */
+    hwloc_obj_t res;
+    res = hwloc__attach_memory_object(topology, cluster, ddr, hwloc_report_os_error);
+    if (res != ddr) {
+      (*failednodes)++;
+      ddr = NULL;
     }
-  hwloc_bitmap_foreach_end();
+    res = hwloc__attach_memory_object(topology, cluster, mcdram, hwloc_report_os_error);
+    if (res != mcdram)
+      (*failednodes)++;
 
-  /* actually insert in the tree now that package cpusets have been fixed-up */
-  while (packages) {
-    hwloc_obj_t next = packages->next_cousin;
-    packages->next_cousin = NULL;
-    hwloc_insert_object_by_cpuset(topology, packages);
-    packages = next;
+  } else {
+    /* we don't know where to attach, let the core find or insert if needed */
+    hwloc_obj_t res;
+    res = hwloc__insert_object_by_cpuset(topology, NULL, ddr, hwloc_report_os_error);
+    if (res != ddr) {
+      (*failednodes)++;
+      ddr = NULL;
+    }
+    if (mcdram) {
+      res = hwloc__insert_object_by_cpuset(topology, NULL, mcdram, hwloc_report_os_error);
+      if (res != mcdram)
+	(*failednodes)++;
+    }
   }
 
-  if (0 == caches_added)
-    look_powerpc_device_tree(topology, data);
+  if (ddr && knl_hwdata->mcdram_cache_size > 0) {
+    /* Now insert the mscache if any */
+    hwloc_obj_t cache = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L3CACHE, HWLOC_UNKNOWN_INDEX);
+    if (!cache)
+      /* failure is harmless */
+      return;
+    cache->attr->cache.depth = 3;
+    cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    cache->attr->cache.size = knl_hwdata->mcdram_cache_size;
+    cache->attr->cache.linesize = knl_hwdata->mcdram_cache_line_size;
+    cache->attr->cache.associativity = knl_hwdata->mcdram_cache_associativity;
+    hwloc_obj_add_info(cache, "Inclusive", knl_hwdata->mcdram_cache_inclusiveness ? "1" : "0");
+    cache->cpuset = hwloc_bitmap_dup(ddr->cpuset);
+    cache->nodeset = hwloc_bitmap_dup(ddr->nodeset); /* only applies to DDR */
+    if (mscache_as_l3) {
+      /* make it a L3 */
+      cache->subtype = strdup("MemorySideCache");
+      hwloc_insert_object_by_cpuset(topology, cache);
+    } else {
+      /* make it a real mscache */
+      cache->type = HWLOC_OBJ_MEMCACHE;
+      if (cluster)
+	hwloc__attach_memory_object(topology, cluster, cache, hwloc_report_os_error);
+      else
+	hwloc__insert_object_by_cpuset(topology, NULL, cache, hwloc_report_os_error);
+    }
+  }
+}
 
-  hwloc_bitmap_free(cpuset);
-  hwloc_bitmap_free(unknownset);
+static void
+hwloc_linux_knl_numa_quirk(struct hwloc_topology *topology,
+			   struct hwloc_linux_backend_data_s *data,
+			   hwloc_obj_t *nodes, unsigned nbnodes,
+			   uint64_t * distances,
+			   unsigned *failednodes)
+{
+  struct knl_hwdata hwdata;
+  struct knl_distances_summary dist;
+  unsigned i;
+  char * fallback_env = getenv("HWLOC_KNL_HDH_FALLBACK");
+  int fallback = fallback_env ? atoi(fallback_env) : -1; /* by default, only fallback if needed */
+  char * mscache_as_l3_env = getenv("HWLOC_KNL_MSCACHE_L3");
+  int mscache_as_l3 = mscache_as_l3_env ? atoi(mscache_as_l3_env) : 1; /* L3 by default, for backward compat */
+
+  if (*failednodes)
+    goto error;
+
+  if (hwloc_linux_knl_parse_numa_distances(nbnodes, distances, &dist) < 0)
+    goto error;
+
+  hwdata.memory_mode[0] = '\0';
+  hwdata.cluster_mode[0] = '\0';
+  hwdata.mcdram_cache_size = -1;
+  hwdata.mcdram_cache_associativity = -1;
+  hwdata.mcdram_cache_inclusiveness = -1;
+  hwdata.mcdram_cache_line_size = -1;
+  if (fallback == 1)
+    hwloc_debug("KNL dumped hwdata ignored, forcing fallback to heuristics\n");
+  else
+    hwloc_linux_knl_read_hwdata_properties(data, &hwdata);
+  if (fallback != 0)
+    hwloc_linux_knl_guess_hwdata_properties(&hwdata, nodes, nbnodes, &dist);
+
+  if (strcmp(hwdata.cluster_mode, "All2All")
+      && strcmp(hwdata.cluster_mode, "Hemisphere")
+      && strcmp(hwdata.cluster_mode, "Quadrant")
+      && strcmp(hwdata.cluster_mode, "SNC2")
+      && strcmp(hwdata.cluster_mode, "SNC4")) {
+    fprintf(stderr, "Failed to find a usable KNL cluster mode (%s)\n", hwdata.cluster_mode);
+    goto error;
+  }
+  if (strcmp(hwdata.memory_mode, "Cache")
+      && strcmp(hwdata.memory_mode, "Flat")
+      && strcmp(hwdata.memory_mode, "Hybrid25")
+      && strcmp(hwdata.memory_mode, "Hybrid50")) {
+    fprintf(stderr, "Failed to find a usable KNL memory mode (%s)\n", hwdata.memory_mode);
+    goto error;
+  }
+
+  if (mscache_as_l3) {
+    if (!hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L3CACHE))
+      hwdata.mcdram_cache_size = 0;
+  } else {
+    if (!hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_MEMCACHE))
+      hwdata.mcdram_cache_size = 0;
+  }
 
-  return 0;
-}
+  hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", hwdata.cluster_mode);
+  hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", hwdata.memory_mode);
 
+  if (!strcmp(hwdata.cluster_mode, "All2All")
+      || !strcmp(hwdata.cluster_mode, "Hemisphere")
+      || !strcmp(hwdata.cluster_mode, "Quadrant")) {
+    if (!strcmp(hwdata.memory_mode, "Cache")) {
+      /* Quadrant-Cache */
+      if (nbnodes != 1) {
+	fprintf(stderr, "Found %u NUMA nodes instead of 1 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      hwloc_linux_knl_add_cluster(topology, nodes[0], NULL, &hwdata, mscache_as_l3, failednodes);
 
+    } else {
+      /* Quadrant-Flat/Hybrid */
+      if (nbnodes != 2) {
+	fprintf(stderr, "Found %u NUMA nodes instead of 2 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      if (!strcmp(hwdata.memory_mode, "Flat"))
+	hwdata.mcdram_cache_size = 0;
+      hwloc_linux_knl_add_cluster(topology, nodes[0], nodes[1], &hwdata, mscache_as_l3, failednodes);
+    }
 
-/****************************************
- ****** cpuinfo Topology Discovery ******
- ****************************************/
+  } else if (!strcmp(hwdata.cluster_mode, "SNC2")) {
+    if (!strcmp(hwdata.memory_mode, "Cache")) {
+      /* SNC2-Cache */
+      if (nbnodes != 2) {
+	fprintf(stderr, "Found %u NUMA nodes instead of 2 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      hwloc_linux_knl_add_cluster(topology, nodes[0], NULL, &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[1], NULL, &hwdata, mscache_as_l3, failednodes);
 
-static int
-hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
-			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
-			      int is_global __hwloc_attribute_unused)
-{
-  if (!strcmp("vendor_id", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUVendor", value);
-  } else if (!strcmp("model name", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUModel", value);
-  } else if (!strcmp("model", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
-  } else if (!strcmp("cpu family", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
-  } else if (!strcmp("stepping", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUStepping", value);
+    } else {
+      /* SNC2-Flat/Hybrid */
+      unsigned ddr[2], mcdram[2];
+      if (nbnodes != 4) { /* this mode pairs 2 DDR nodes with 2 MCDRAM nodes (ddr[2], mcdram[2]) */
+	fprintf(stderr, "Found %u NUMA nodes instead of 4 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      if (hwloc_linux_knl_identify_4nodes(distances, &dist, ddr, mcdram) < 0) {
+	fprintf(stderr, "Unexpected distance layout for mode %s-%s\n", hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      if (!strcmp(hwdata.memory_mode, "Flat"))
+	hwdata.mcdram_cache_size = 0;
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[0]], nodes[mcdram[0]], &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[1]], nodes[mcdram[1]], &hwdata, mscache_as_l3, failednodes);
+    }
+
+  } else if (!strcmp(hwdata.cluster_mode, "SNC4")) {
+    if (!strcmp(hwdata.memory_mode, "Cache")) {
+      /* SNC4-Cache */
+      if (nbnodes != 4) {
+	fprintf(stderr, "Found %u NUMA nodes instead of 4 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      hwloc_linux_knl_add_cluster(topology, nodes[0], NULL, &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[1], NULL, &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[2], NULL, &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[3], NULL, &hwdata, mscache_as_l3, failednodes);
+
+    } else {
+      /* SNC4-Flat/Hybrid */
+      unsigned ddr[4], mcdram[4];
+      if (nbnodes != 8) { /* this mode pairs 4 DDR nodes with 4 MCDRAM nodes (ddr[4], mcdram[4]) */
+	fprintf(stderr, "Found %u NUMA nodes instead of 8 in mode %s-%s\n", nbnodes, hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      if (hwloc_linux_knl_identify_8nodes(distances, &dist, ddr, mcdram) < 0) {
+	fprintf(stderr, "Unexpected distance layout for mode %s-%s\n", hwdata.cluster_mode, hwdata.memory_mode);
+	goto error;
+      }
+      if (!strcmp(hwdata.memory_mode, "Flat"))
+	hwdata.mcdram_cache_size = 0;
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[0]], nodes[mcdram[0]], &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[1]], nodes[mcdram[1]], &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[2]], nodes[mcdram[2]], &hwdata, mscache_as_l3, failednodes);
+      hwloc_linux_knl_add_cluster(topology, nodes[ddr[3]], nodes[mcdram[3]], &hwdata, mscache_as_l3, failednodes);
+    }
   }
-  return 0;
-}
 
-static int
-hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
-			       struct hwloc_obj_info_s **infos, unsigned *infos_count,
-			       int is_global __hwloc_attribute_unused)
-{
-  if (!strcmp("vendor", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUVendor", value);
-  } else if (!strcmp("model name", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUModel", value);
-  } else if (!strcmp("model", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
-  } else if (!strcmp("family", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
+  return;
+
+ error:
+  /* just insert nodes basically */
+  for (i = 0; i < nbnodes; i++) {
+    hwloc_obj_t node = nodes[i];
+    if (node) {
+      hwloc_obj_t res_obj = hwloc__insert_object_by_cpuset(topology, NULL, node, hwloc_report_os_error);
+      if (res_obj != node)
+	/* This NUMA node got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
+	 * This object disappeared, we'll ignore distances */
+	(*failednodes)++;
+    }
   }
-  return 0;
 }
 
-static int
-hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
-			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
-			      int is_global __hwloc_attribute_unused)
-{
-  if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
-      || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
-    hwloc__add_info(infos, infos_count, "CPUModel", value);
-  } else if (!strcmp("CPU implementer", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUImplementer", value);
-  } else if (!strcmp("CPU architecture", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
-  } else if (!strcmp("CPU variant", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUVariant", value);
-  } else if (!strcmp("CPU part", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUPart", value);
-  } else if (!strcmp("CPU revision", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPURevision", value);
-  } else if (!strcmp("Hardware", prefix)) {
-    hwloc__add_info(infos, infos_count, "HardwareName", value);
-  } else if (!strcmp("Revision", prefix)) {
-    hwloc__add_info(infos, infos_count, "HardwareRevision", value);
-  } else if (!strcmp("Serial", prefix)) {
-    hwloc__add_info(infos, infos_count, "HardwareSerial", value);
-  }
-  return 0;
-}
 
+/**************************************
+ ****** Sysfs Topology Discovery ******
+ **************************************/
+
+/* try to find locality of CPU-less NUMA nodes by looking at their distances */
 static int
-hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
-			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
-			      int is_global)
+fixup_cpuless_node_locality_from_distances(unsigned i,
+					   unsigned nbnodes, hwloc_obj_t *nodes, uint64_t *distances)
 {
-  /* common fields */
-  if (!strcmp("cpu", prefix)) {
-    hwloc__add_info(infos, infos_count, "CPUModel", value);
-  } else if (!strcmp("platform", prefix)) {
-    hwloc__add_info(infos, infos_count, "PlatformName", value);
-  } else if (!strcmp("model", prefix)) {
-    hwloc__add_info(infos, infos_count, "PlatformModel", value);
-  }
-  /* platform-specific fields */
-  else if (!strcasecmp("vendor", prefix)) {
-    hwloc__add_info(infos, infos_count, "PlatformVendor", value);
-  } else if (!strcmp("Board ID", prefix)) {
-    hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
-  } else if (!strcmp("Board", prefix)
-	     || !strcasecmp("Machine", prefix)) {
-    /* machine and board are similar (and often more precise) than model above */
-    char **valuep = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
-    if (*valuep)
-      free(*valuep);
-    *valuep = strdup(value);
-  } else if (!strcasecmp("Revision", prefix)
-	     || !strcmp("Hardware rev", prefix)) {
-    hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
-  } else if (!strcmp("SVR", prefix)) {
-    hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
-  } else if (!strcmp("PVR", prefix)) {
-    hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
+  unsigned min = UINT_MAX;
+  unsigned nb = 0, j;
+
+  for(j=0; j<nbnodes; j++) {
+    if (j==i || !nodes[j])
+      continue;
+    if (distances[i*nbnodes+j] < min) {
+      min = distances[i*nbnodes+j];
+      nb = 1;
+    } else if (distances[i*nbnodes+j] == min) {
+      nb++;
+    }
   }
-  /* don't match 'board*' because there's also "board l2" on some platforms */
+
+  if (min <= distances[i*nbnodes+i] || min == UINT_MAX || nb == nbnodes-1)
+    return -1;
+
+  /* not local, but closer to *some* other nodes */
+  for(j=0; j<nbnodes; j++)
+    if (j!=i && nodes[j] && distances[i*nbnodes+j] == min)
+      hwloc_bitmap_or(nodes[i]->cpuset, nodes[i]->cpuset, nodes[j]->cpuset);
   return 0;
 }
 
-/*
- * avr32: "chip type\t:"			=> OK
- * blackfin: "model name\t:"			=> OK
- * h8300: "CPU:"				=> OK
- * m68k: "CPU:"					=> OK
- * mips: "cpu model\t\t:"			=> OK
- * openrisc: "CPU:"				=> OK
- * sparc: "cpu\t\t:"				=> OK
- * tile: "model name\t:"			=> OK
- * unicore32: "Processor\t:"			=> OK
- * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:"	=> "cpu" overwritten by "cpu model", no processor indexes
- * cris: "cpu\t\t:" + "cpu model\t:"		=> only "cpu"
- * frv: "CPU-Core:" + "CPU:"			=> only "CPU"
- * mn10300: "cpu core   :" + "model name :"	=> only "model name"
- * parisc: "cpu family\t:" + "cpu\t\t:"		=> only "cpu"
+/* Try to find the locality of a CPU-less NUMA node by OR'ing the cpusets of its HMAT initiators into its own cpuset.
  *
- * not supported because of conflicts with other arch minor lines:
- * m32r: "cpu family\t:"			=> KO (adding "cpu family" would break "blackfin")
- * microblaze: "CPU-Family:"			=> KO
- * sh: "cpu family\t:" + "cpu type\t:"		=> KO
- * xtensa: "model\t\t:"				=> KO
+ * Initiators are the other nodes listed in <path>/node<os_index>/access0/initiators that are also present in nodes[]; returns -1 if that directory cannot be opened, 0 otherwise.
+ * In theory, we may have HMAT info only for some nodes; in practice, if this never occurs, we may want to assume HMAT for either all or no nodes.
  */
 static int
-hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
-				  struct hwloc_obj_info_s **infos, unsigned *infos_count,
-				  int is_global __hwloc_attribute_unused)
+read_node_initiators(struct hwloc_linux_backend_data_s *data,
+		     hwloc_obj_t node, unsigned nbnodes, hwloc_obj_t *nodes,
+		     const char *path)
 {
-  if (!strcmp("model name", prefix)
-      || !strcmp("Processor", prefix)
-      || !strcmp("chip type", prefix)
-      || !strcmp("cpu model", prefix)
-      || !strcasecmp("cpu", prefix)) {
-    /* keep the last one, assume it's more precise than the first one.
-     * we should have the Architecture keypair for basic information anyway.
-     */
-    char **valuep = hwloc__find_info_slot(infos, infos_count, "CPUModel");
-    if (*valuep)
-      free(*valuep);
-    *valuep = strdup(value);
+  char accesspath[SYSFS_NUMA_NODE_PATH_LEN];
+  DIR *dir;
+  struct dirent *dirent;
+
+  sprintf(accesspath, "%s/node%u/access0/initiators", path, node->os_index);
+  dir = hwloc_opendir(accesspath, data->root_fd);
+  if (!dir)
+    return -1;
+
+  while ((dirent = readdir(dir)) != NULL) {
+    unsigned initiator_os_index;
+    if (sscanf(dirent->d_name, "node%u", &initiator_os_index) == 1
+	&& initiator_os_index != node->os_index) {
+      /* this entry names an initiator that is not the node itself:
+       * look it up in nodes[] and OR its cpuset into ours (entries missing from nodes[] are ignored)
+       */
+      unsigned j;
+      for(j=0; j<nbnodes; j++)
+	if (nodes[j] && nodes[j]->os_index == initiator_os_index) {
+	  hwloc_bitmap_or(node->cpuset, node->cpuset, nodes[j]->cpuset);
+	  break;
+	}
+    }
   }
+  closedir(dir);
   return 0;
 }
 
+/* Stack one MEMCACHE object above *treep for each readable <path>/node<os_index>/memory_side_cache/index<depth> entry (size, line_size and indexing files).
+ * Return -1 if the memory_side_cache directory cannot be opened (e.g. the kernel doesn't support mscache), otherwise return 0 after updating
+ * *treep (containing only the node on input) with the caches (if any); entries whose files cannot be read or whose allocation fails are skipped. */
 static int
-hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
-			  const char *path,
-			  struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
-			  struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
+read_node_mscaches(struct hwloc_topology *topology,
+		   struct hwloc_linux_backend_data_s *data,
+		   const char *path,
+		   hwloc_obj_t *treep)
 {
-  FILE *fd;
-  char *str = NULL;
-  char *endptr;
-  unsigned len;
-  unsigned allocated_Lprocs = 0;
-  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
-  unsigned numprocs = 0;
-  int curproc = -1;
-  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
+  hwloc_obj_t tree = *treep, node = tree;
+  unsigned osnode = node->os_index;
+  char mscpath[SYSFS_NUMA_NODE_PATH_LEN];
+  DIR *mscdir;
+  struct dirent *dirent;
 
-  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
-    {
-      hwloc_debug("could not open %s\n", path);
-      return -1;
-    }
+  sprintf(mscpath, "%s/node%u/memory_side_cache", path, osnode);
+  mscdir = hwloc_opendir(mscpath, data->root_fd);
+  if (!mscdir)
+    return -1;
 
-#      define PROCESSOR	"processor"
-#      define PACKAGEID "physical id" /* the longest one */
-#      define COREID "core id"
-  len = 128; /* vendor/model can be very long */
-  str = malloc(len);
-  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
-  while (fgets(str,len,fd)!=NULL) {
-    unsigned long Ppkg, Pcore, Pproc;
-    char *end, *dot, *prefix, *value;
-    int noend = 0;
+  while ((dirent = readdir(mscdir)) != NULL) {
+    unsigned depth;
+    uint64_t size;
+    unsigned line_size;
+    unsigned associativity;
+    hwloc_obj_t cache;
 
-    /* remove the ending \n */
-    end = strchr(str, '\n');
-    if (end)
-      *end = 0;
-    else
-      noend = 1;
-    /* if empty line, skip and reset curproc */
-    if (!*str) {
-      curproc = -1;
-      continue;
-    }
-    /* skip lines with no dot */
-    dot = strchr(str, ':');
-    if (!dot)
+    if (strncmp(dirent->d_name, "index", 5))
       continue;
-    /* skip lines not starting with a letter */
-    if ((*str > 'z' || *str < 'a')
-	&& (*str > 'Z' || *str < 'A'))
+
+    depth = atoi(dirent->d_name+5);
+
+    sprintf(mscpath, "%s/node%u/memory_side_cache/index%u/size", path, osnode, depth);
+    if (hwloc_read_path_as_uint64(mscpath, &size, data->root_fd) < 0)
       continue;
 
-    /* mark the end of the prefix */
-    prefix = str;
-    end = dot;
-    while (end[-1] == ' ' || end[-1] == '	') end--; /* need a strrspn() */
-    *end = 0;
-    /* find beginning of value, its end is already marked */
-    value = dot+1 + strspn(dot+1, " 	");
+    sprintf(mscpath, "%s/node%u/memory_side_cache/index%u/line_size", path, osnode, depth);
+    if (hwloc_read_path_as_uint(mscpath, &line_size, data->root_fd) < 0)
+      continue;
 
-    /* defines for parsing numbers */
-#   define getprocnb_begin(field, var)					\
-    if (!strcmp(field,prefix)) {					\
-      var = strtoul(value,&endptr,0);					\
-      if (endptr==value) {						\
-	hwloc_debug("no number in "field" field of %s\n", path);	\
-	goto err;							\
-      } else if (var==ULONG_MAX) {					\
-	hwloc_debug("too big "field" number in %s\n", path); 		\
-	goto err;							\
-      }									\
-      hwloc_debug(field " %lu\n", var)
-#   define getprocnb_end()						\
-    }
-    /* actually parse numbers */
-    getprocnb_begin(PROCESSOR, Pproc);
-    curproc = numprocs++;
-    if (numprocs > allocated_Lprocs) {
-      if (!allocated_Lprocs)
-	allocated_Lprocs = 8;
-      else
-        allocated_Lprocs *= 2;
-      Lprocs = realloc(Lprocs, allocated_Lprocs * sizeof(*Lprocs));
+    sprintf(mscpath, "%s/node%u/memory_side_cache/index%u/indexing", path, osnode, depth);
+    if (hwloc_read_path_as_uint(mscpath, &associativity, data->root_fd) < 0)
+      continue;
+    /* value read from "indexing": 0 for direct-mapped, 1 for indexed (don't know how many ways), 2 for custom/other */
+
+    cache = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MEMCACHE, HWLOC_UNKNOWN_INDEX);
+    if (cache) {
+      cache->nodeset = hwloc_bitmap_dup(node->nodeset);
+      cache->cpuset = hwloc_bitmap_dup(node->cpuset);
+      cache->attr->cache.size = size;
+      cache->attr->cache.depth = depth;
+      cache->attr->cache.linesize = line_size;
+      cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+      cache->attr->cache.associativity = !associativity ? 1 /* direct-mapped */ : 0 /* unknown */;
+      hwloc_debug_1arg_bitmap("mscache %s has nodeset %s\n",
+			      dirent->d_name, cache->nodeset);
+
+      cache->memory_first_child = tree;
+      tree = cache;
     }
-    Lprocs[curproc].Pproc = Pproc;
-    Lprocs[curproc].Pcore = -1;
-    Lprocs[curproc].Ppkg = -1;
-    Lprocs[curproc].Lcore = -1;
-    Lprocs[curproc].Lpkg = -1;
-    Lprocs[curproc].infos = NULL;
-    Lprocs[curproc].infos_count = 0;
-    getprocnb_end() else
-    getprocnb_begin(PACKAGEID, Ppkg);
-    Lprocs[curproc].Ppkg = Ppkg;
-    getprocnb_end() else
-    getprocnb_begin(COREID, Pcore);
-    Lprocs[curproc].Pcore = Pcore;
-    getprocnb_end() else {
+  }
+  closedir(mscdir);
+  *treep = tree;
+  return 0;
+}
 
-      /* architecture specific or default routine for parsing cpumodel */
-      if (!parse_cpuinfo_func) {
-	parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
-	if (*data->utsname.machine) {
-	  /* x86_32 x86_64 k1om => x86 */
-	  if (!strcmp(data->utsname.machine, "x86_64")
-	      || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
-	      || !strcmp(data->utsname.machine, "k1om"))
-	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
-	  /* ia64 */
-	  else if (!strcmp(data->utsname.machine, "ia64"))
-	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
-	  /* arm */
-	  else if (!strncmp(data->utsname.machine, "arm", 3))
-	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
-	  else if (!strncmp(data->utsname.machine, "ppc", 3)
-		   || !strncmp(data->utsname.machine, "power", 5))
-	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
-	}
-      }
-      /* we can't assume that we already got a processor index line:
-       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
-       * tile has a global section with model name before the list of processor lines.
-       */
-      parse_cpuinfo_func(prefix, value,
-			 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
-			 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
-			 curproc < 0);
-    }
+static unsigned *
+list_sysfsnode(struct hwloc_topology *topology,
+	       struct hwloc_linux_backend_data_s *data,
+	       const char *path,
+	       unsigned *nbnodesp)
+{
+  DIR *dir;
+  unsigned osnode, nbnodes = 0;
+  unsigned *indexes, index_;
+  hwloc_bitmap_t nodeset;
+  struct dirent *dirent;
 
-    if (noend) {
-      /* ignore end of line */
-      if (fscanf(fd,"%*[^\n]") == EOF)
-	break;
-      getc(fd);
-    }
+  /* Return a calloc'ed array (to be freed by the caller) of the NUMA node OS indexes in bitmap iteration order, with its length in *nbnodesp; NULL on failure or nodeset mismatch.
+   *
+   * try to get the list of NUMA nodes at once, otherwise we'll list the entire <path> directory.
+   * offline nodes don't exist at all under /sys (they are in "possible", we may ignore them).
+   *
+   * don't use <path>/online, /sys/bus/node/devices only contains node%d
+   */
+  nodeset = hwloc__alloc_read_path_as_cpulist("/sys/devices/system/node/online", data->root_fd);
+  if (nodeset) {
+    int _nbnodes = hwloc_bitmap_weight(nodeset);
+    assert(_nbnodes >= 1);
+    nbnodes = (unsigned)_nbnodes;
+    hwloc_debug_bitmap("possible NUMA nodes %s\n", nodeset);
+    goto found;
   }
-  fclose(fd);
-  free(str);
 
-  *Lprocs_p = Lprocs;
-  return numprocs;
+  /* Get the list of nodes first */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (!dir)
+    return NULL;
 
- err:
-  fclose(fd);
-  free(str);
-  free(Lprocs);
-  return -1;
-}
+  nodeset = hwloc_bitmap_alloc();
+  if (!nodeset) {
+    closedir(dir);
+    return NULL;
+  }
 
-static void
-hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
-			 struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
-{
-  if (Lprocs) {
-    unsigned i;
-    for(i=0; i<numprocs; i++) {
-      hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
-    }
-    free(Lprocs);
+  while ((dirent = readdir(dir)) != NULL) {
+    if (strncmp(dirent->d_name, "node", 4))
+      continue;
+    osnode = strtoul(dirent->d_name+4, NULL, 0);
+    hwloc_bitmap_set(nodeset, osnode);
+    nbnodes++;
   }
-  hwloc__free_infos(global_infos, global_infos_count);
-}
+  closedir(dir);
 
-static int
-look_cpuinfo(struct hwloc_topology *topology,
-	     struct hwloc_linux_backend_data_s *data,
-	     const char *path)
-{
-  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
-  struct hwloc_obj_info_s *global_infos = NULL;
-  unsigned global_infos_count = 0;
-  /* P for physical/OS index, L for logical (e.g. in we order we get them, not in the final hwloc logical order) */
-  unsigned *Lcore_to_Pcore;
-  unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
-  unsigned *Lpkg_to_Ppkg;
-  int _numprocs;
-  unsigned numprocs;
-  unsigned numpkgs=0;
-  unsigned numcores=0;
-  unsigned long Lproc;
-  unsigned missingpkg;
-  unsigned missingcore;
-  unsigned i,j;
+  assert(nbnodes >= 1); /* linux cannot have a "node/" subdirectory without at least one "node%d" */
+
+  /* we don't know if sysfs returns nodes in order, we can't merge above and below loops */
+
+ found:
+  /* if there are already some nodes, we'll annotate them. make sure the indexes match */
+  if (!hwloc_bitmap_iszero(topology->levels[0][0]->nodeset)
+      && !hwloc_bitmap_isequal(nodeset, topology->levels[0][0]->nodeset)) {
+    char *sn, *tn; /* sn: nodeset just listed from sysfs, tn: nodeset already in the topology */
+    hwloc_bitmap_asprintf(&sn, nodeset);
+    hwloc_bitmap_asprintf(&tn, topology->levels[0][0]->nodeset);
+    fprintf(stderr, "linux/sysfs: ignoring nodes because nodeset %s doesn't match existing nodeset %s.\n", sn, tn);
+    free(sn);
+    free(tn);
+    hwloc_bitmap_free(nodeset);
+    return NULL;
+  }
 
-  /* parse the entire cpuinfo first, fill the Lprocs array and numprocs */
-  _numprocs = hwloc_linux_parse_cpuinfo(data, path, &Lprocs, &global_infos, &global_infos_count);
+  indexes = calloc(nbnodes, sizeof(*indexes));
+  if (!indexes) {
+    hwloc_bitmap_free(nodeset);
+    return NULL;
+  }
 
+  /* Unsparsify node indexes.
+   * We'll need them later because Linux groups sparse distances
+   * and keeps them in order in the sysfs distance files.
+   * It'll simplify things in the meantime.
+   */
+  index_ = 0;
+  hwloc_bitmap_foreach_begin (osnode, nodeset) {
+    indexes[index_] = osnode;
+    index_++;
+  } hwloc_bitmap_foreach_end();
 
-  /* setup root info */
-  hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
-		    &global_infos, &global_infos_count);
+  hwloc_bitmap_free(nodeset);
 
+#ifdef HWLOC_DEBUG
+  hwloc_debug("%s", "NUMA indexes: ");
+  for (index_ = 0; index_ < nbnodes; index_++)
+    hwloc_debug(" %u", indexes[index_]);
+  hwloc_debug("%s", "\n");
+#endif
 
-  if (_numprocs <= 0)
-    /* found no processor */
-    return -1;
-  numprocs = _numprocs;
+  *nbnodesp = nbnodes;
+  return indexes;
+}
 
-  /* initialize misc arrays, there can be at most numprocs entries */
-  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
-  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
-  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
-  for (i = 0; i < numprocs; i++) {
-    Lcore_to_Pcore[i] = -1;
-    Lcore_to_Ppkg[i] = -1;
-    Lpkg_to_Ppkg[i] = -1;
-  }
+static int
+annotate_sysfsnode(struct hwloc_topology *topology,
+		   struct hwloc_linux_backend_data_s *data,
+		   const char *path, unsigned *found)
+{
+  unsigned nbnodes;
+  hwloc_obj_t * nodes; /* the array of NUMA node objects, to be used for inserting distances (freed here only when no distances are added) */
+  hwloc_obj_t node;
+  unsigned * indexes;
+  uint64_t * distances;
+  unsigned i;
 
-  /* create PU objects */
-  for(Lproc=0; Lproc<numprocs; Lproc++) {
-    unsigned long Pproc = Lprocs[Lproc].Pproc;
-    hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
-    obj->cpuset = hwloc_bitmap_alloc();
-    hwloc_bitmap_only(obj->cpuset, Pproc);
-    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
-			     Lproc, Pproc, obj->cpuset);
-    hwloc_insert_object_by_cpuset(topology, obj);
-  }
+  /* Annotate the NUMA nodes already in the topology with their sysfs meminfo and, if enabled and readable, their distances; always returns 0 and writes *found only after the arrays are allocated. NUMA nodes cannot be filtered out */
+  indexes = list_sysfsnode(topology, data, path, &nbnodes);
+  if (!indexes)
+    return 0;
 
-  topology->support.discovery->pu = 1;
+  nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+  distances = malloc(nbnodes*nbnodes*sizeof(*distances));
+  if (NULL == nodes || NULL == distances) {
+    free(nodes);
+    free(indexes);
+    free(distances);
+    return 0;
+  }
 
-  hwloc_debug("%s", "\n * Topology summary *\n");
-  hwloc_debug("%u processors)\n", numprocs);
+  for(node=hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, NULL);
+      node != NULL;
+      node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, node)) {
+    assert(node); /* list_sysfsnode() ensured that sysfs nodes and existing nodes match */
 
-  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
-  for(Lproc=0; Lproc<numprocs; Lproc++) {
-    long Ppkg = Lprocs[Lproc].Ppkg;
-    if (Ppkg != -1) {
-      unsigned long Pproc = Lprocs[Lproc].Pproc;
-      for (i=0; i<numpkgs; i++)
-	if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
-	  break;
-      Lprocs[Lproc].Lpkg = i;
-      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
-      if (i==numpkgs) {
-	Lpkg_to_Ppkg[numpkgs] = Ppkg;
-	numpkgs++;
+    /* hwloc_parse_nodes_distances() requires nodes in physical index order,
+     * and inserting distances requires nodes[] and indexes[] in same order:
+     * store each existing node at the slot of its OS index in indexes[].
+     */
+    for(i=0; i<nbnodes; i++)
+      if (indexes[i] == node->os_index) {
+	nodes[i] = node;
+	break;
       }
-    }
-  }
-  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
-   * provide bogus information. We should rather drop it. */
-  missingpkg=0;
-  for(j=0; j<numprocs; j++)
-    if (Lprocs[i].Ppkg == -1) {
-      missingpkg=1;
-      break;
-    }
-  /* create package objects */
-  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
-  if (!missingpkg && numpkgs>0) {
-    for (i = 0; i < numpkgs; i++) {
-      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
-      int doneinfos = 0;
-      obj->cpuset = hwloc_bitmap_alloc();
-      for(j=0; j<numprocs; j++)
-	if ((unsigned) Lprocs[j].Lpkg == i) {
-	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
-	  if (!doneinfos) {
-	    hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
-	    doneinfos = 1;
-	  }
-	}
-      hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
-      hwloc_insert_object_by_cpuset(topology, obj);
-    }
-    hwloc_debug("%s", "\n");
+
+    hwloc_get_sysfs_node_meminfo(data, path, node->os_index, &node->attr->numanode);
   }
 
-  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
-  for(Lproc=0; Lproc<numprocs; Lproc++) {
-    long Pcore = Lprocs[Lproc].Pcore;
-    if (Pcore != -1) {
-      for (i=0; i<numcores; i++)
-	if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
-	  break;
-      Lprocs[Lproc].Lcore = i;
-      if (i==numcores) {
-	Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
-	Lcore_to_Pcore[numcores] = Pcore;
-	numcores++;
-      }
-    }
-  }
-  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
-   * provide bogus information. We should rather drop it. */
-  missingcore=0;
-  for(j=0; j<numprocs; j++)
-    if (Lprocs[i].Pcore == -1) {
-      missingcore=1;
-      break;
-    }
-  /* create Core objects */
-  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
-  if (!missingcore && numcores>0) {
-    for (i = 0; i < numcores; i++) {
-      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
-      obj->cpuset = hwloc_bitmap_alloc();
-      for(j=0; j<numprocs; j++)
-	if ((unsigned) Lprocs[j].Lcore == i)
-	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
-      hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
-      hwloc_insert_object_by_cpuset(topology, obj);
-    }
-    hwloc_debug("%s", "\n");
-  }
-
-  free(Lcore_to_Pcore);
-  free(Lcore_to_Ppkg);
-  free(Lpkg_to_Ppkg);
-
-  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
-
-  look_powerpc_device_tree(topology, data);
-  return 0;
-}
-
-
-
-/*************************************
- ****** Main Topology Discovery ******
- *************************************/
-
-static void
-hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
-{
-  FILE *file;
-  char line[64], *tmp, *end;
-  file = hwloc_fopen("/proc/elog", "r", data->root_fd);
-  if (!file)
-    return;
-  if (!fgets(line, sizeof(line), file))
-    goto out_with_file;
-  if (strncmp(line, "Card ", 5))
-    goto out_with_file;
-  tmp = line + 5;
-  end = strchr(tmp, ':');
-  if (!end)
-    goto out_with_file;
-  *end = '\0';
-  hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
-
- out_with_file:
-  fclose(file);
-}
-
-static void
-hwloc_linux_fallback_pu_level(struct hwloc_topology *topology)
-{
-  if (topology->is_thissystem)
-    hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
-  else
-    /* fsys-root but not this system, no way, assume there's just 1
-     * processor :/ */
-    hwloc_setup_pu_level(topology, 1);
-}
-
-static void
-hwloc_gather_system_info(struct hwloc_topology *topology,
-			 struct hwloc_linux_backend_data_s *data)
-{
-  FILE *file;
-  char line[128]; /* enough for utsname fields */
-  const char *env;
-
-  /* initialize to something sane */
-  memset(&data->utsname, 0, sizeof(data->utsname));
-
-  /* read thissystem info */
-  if (topology->is_thissystem)
-    uname(&data->utsname);
-
-  /* overwrite with optional /proc/hwloc-nofile-info */
-  file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
-  if (file) {
-    while (fgets(line, sizeof(line), file)) {
-      char *tmp = strchr(line, '\n');
-      if (!strncmp("OSName: ", line, 8)) {
-	if (tmp)
-	  *tmp = '\0';
-	strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
-	data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
-      } else if (!strncmp("OSRelease: ", line, 11)) {
-	if (tmp)
-	  *tmp = '\0';
-	strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
-	data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
-      } else if (!strncmp("OSVersion: ", line, 11)) {
-	if (tmp)
-	  *tmp = '\0';
-	strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
-	data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
-      } else if (!strncmp("HostName: ", line, 10)) {
-	if (tmp)
-	  *tmp = '\0';
-	strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
-	data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
-      } else if (!strncmp("Architecture: ", line, 14)) {
-	if (tmp)
-	  *tmp = '\0';
-	strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
-	data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
-      } else {
-	hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
-	/* ignored */
-      }
-    }
-    fclose(file);
-  }
+  topology->support.discovery->numa = 1;
+  topology->support.discovery->numa_memory = 1;
+  topology->support.discovery->disallowed_numa = 1;
 
-  env = getenv("HWLOC_DUMP_NOFILE_INFO");
-  if (env && *env) {
-    file = fopen(env, "w");
-    if (file) {
-      if (*data->utsname.sysname)
-	fprintf(file, "OSName: %s\n", data->utsname.sysname);
-      if (*data->utsname.release)
-	fprintf(file, "OSRelease: %s\n", data->utsname.release);
-      if (*data->utsname.version)
-	fprintf(file, "OSVersion: %s\n", data->utsname.version);
-      if (*data->utsname.nodename)
-	fprintf(file, "HostName: %s\n", data->utsname.nodename);
-      if (*data->utsname.machine)
-	fprintf(file, "Architecture: %s\n", data->utsname.machine);
-      fclose(file);
-    }
+  if (nbnodes >= 2
+      && data->use_numa_distances
+      && !hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd)) {
+    hwloc_internal_distances_add(topology, "NUMALatency", nbnodes, nodes, distances,
+				 HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY,
+				 HWLOC_DISTANCES_ADD_FLAG_GROUP);
+  } else {
+    free(nodes);
+    free(distances);
   }
 
-#if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
-  if (topology->is_thissystem)
-    data->arch = HWLOC_LINUX_ARCH_X86;
-#endif
-  if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
-    if (!strcmp(data->utsname.machine, "x86_64")
-	|| (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
-	|| !strcmp(data->utsname.machine, "k1om"))
-      data->arch = HWLOC_LINUX_ARCH_X86;
-    else if (!strncmp(data->utsname.machine, "arm", 3))
-      data->arch = HWLOC_LINUX_ARCH_ARM;
-    else if (!strncmp(data->utsname.machine, "ppc", 3)
-	     || !strncmp(data->utsname.machine, "power", 5))
-      data->arch = HWLOC_LINUX_ARCH_POWER;
-    else if (!strcmp(data->utsname.machine, "ia64"))
-      data->arch = HWLOC_LINUX_ARCH_IA64;
-  }
+  free(indexes);
+  *found = nbnodes;
+  return 0;
 }
 
 static int
-hwloc_look_linuxfs(struct hwloc_backend *backend)
+look_sysfsnode(struct hwloc_topology *topology,
+	       struct hwloc_linux_backend_data_s *data,
+	       const char *path, unsigned *found)
 {
-  struct hwloc_topology *topology = backend->topology;
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  unsigned osnode;
   unsigned nbnodes;
-  char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
-  int err;
+  hwloc_obj_t * nodes; /* the array of NUMA node objects, to be used for inserting distances */
+  unsigned nr_trees;
+  hwloc_obj_t * trees; /* the array of memory hierarchies to insert */
+  unsigned *indexes;
+  uint64_t * distances;
+  hwloc_bitmap_t nodes_cpuset;
+  unsigned failednodes = 0;
+  unsigned i;
+  DIR *dir;
+  int allow_overlapping_node_cpusets = (getenv("HWLOC_DEBUG_ALLOW_OVERLAPPING_NODE_CPUSETS") != NULL);
+  int need_memcaches = hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_MEMCACHE);
 
-  if (topology->levels[0][0]->cpuset)
-    /* somebody discovered things */
+  /* NUMA nodes cannot be filtered out */
+  indexes = list_sysfsnode(topology, data, path, &nbnodes);
+  if (!indexes)
     return 0;
 
-  hwloc_gather_system_info(topology, data);
-
-  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-
-  /* Gather the list of admin-disabled cpus and mems */
-  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, data->root_fd);
-  if (cgroup_mntpnt || cpuset_mntpnt) {
-    cpuset_name = hwloc_read_linux_cpuset_name(data->root_fd, topology->pid);
-    if (cpuset_name) {
-      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
-      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
-    }
-    free(cgroup_mntpnt);
-    free(cpuset_mntpnt);
+  nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+  trees = calloc(nbnodes, sizeof(hwloc_obj_t));
+  distances = malloc(nbnodes*nbnodes*sizeof(*distances));
+  nodes_cpuset  = hwloc_bitmap_alloc();
+  if (NULL == nodes || NULL == trees || NULL == distances || NULL == nodes_cpuset) {
+    free(nodes);
+    free(trees);
+    free(indexes);
+    free(distances);
+    hwloc_bitmap_free(nodes_cpuset);
+    nbnodes = 0;
+    goto out;
   }
 
-    /* Get the machine memory attributes */
-    hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
+  topology->support.discovery->numa = 1;
+  topology->support.discovery->numa_memory = 1;
+  topology->support.discovery->disallowed_numa = 1;
 
-    /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
-    if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
-      look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
+  /* Create NUMA objects */
+  for (i = 0; i < nbnodes; i++) {
+    hwloc_obj_t node;
+    char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+    hwloc_bitmap_t cpuset;
 
-    /* if we found some numa nodes, the machine object has no local memory */
-    if (nbnodes) {
-      unsigned i;
-      topology->levels[0][0]->memory.local_memory = 0;
-      if (topology->levels[0][0]->memory.page_types)
-        for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
-          topology->levels[0][0]->memory.page_types[i].count = 0;
+    osnode = indexes[i];
+    sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
+    cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
+    if (!cpuset) {
+      /* This NUMA object won't be inserted, we'll ignore distances */
+      failednodes++;
+      continue;
     }
-
-    /* Gather the list of cpus now */
-    if (getenv("HWLOC_LINUX_USE_CPUINFO")
-	|| (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
-	    && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
-	    && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
-	    && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
-	/* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
-	 * or not containing anything interesting */
-      err = look_cpuinfo(topology, data, "/proc/cpuinfo");
-      if (err < 0)
-	hwloc_linux_fallback_pu_level(topology);
-
-    } else {
-      struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
-      struct hwloc_obj_info_s *global_infos = NULL;
-      unsigned global_infos_count = 0;
-      int numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
-      if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
-      unsigned i;
-      const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
-      for(i=0; i<Lprocs[0].infos_count; i++) {
-	if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
-	  cpuvendor = Lprocs[0].infos[i].value;
-	} else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
-	  cpufamilynumber = Lprocs[0].infos[i].value;
-	} else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
-	  cpumodelnumber = Lprocs[0].infos[i].value;
-	}
+    if (hwloc_bitmap_intersects(nodes_cpuset, cpuset)) {
+      /* Buggy BIOS with overlapping NUMA node cpusets, impossible on Linux so far, we should ignore them.
+       * But it may be useful for debugging the core.
+       */
+      if (!allow_overlapping_node_cpusets) {
+	hwloc_debug_1arg_bitmap("node P#%u cpuset %s intersects with previous nodes, ignoring that node.\n", osnode, cpuset);
+	hwloc_bitmap_free(cpuset);
+	failednodes++;
+	continue;
       }
-      if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
-	  && cpufamilynumber && !strcmp(cpufamilynumber, "6")
-	  && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
-	  || !strcmp(cpumodelnumber, "133")))
-	data->is_knl = 1;
-      if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
-	  && cpufamilynumber
-	  && (!strcmp(cpufamilynumber, "21")
-	      || !strcmp(cpufamilynumber, "22")))
-	data->is_amd_with_CU = 1;
-}
-      if (numprocs <= 0)
-	Lprocs = NULL;
-      if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
-        if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
-	  /* sysfs but we failed to read cpu topology, fallback */
-	  hwloc_linux_fallback_pu_level(topology);
-      hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
-			&global_infos, &global_infos_count);
-      hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+      fprintf(stderr, "node P#%u cpuset intersects with previous nodes, forcing its acceptance\n", osnode);
     }
+    hwloc_bitmap_or(nodes_cpuset, nodes_cpuset, cpuset);
+
+    node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, osnode);
+    node->cpuset = cpuset;
+    node->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(node->nodeset, osnode);
+    hwloc_get_sysfs_node_meminfo(data, path, osnode, &node->attr->numanode);
+
+    nodes[i] = node;
+    hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+			  osnode, node->cpuset);
+  }
+
+      /* try to find NUMA nodes that correspond to NVIDIA GPU memory */
+      dir = hwloc_opendir("/proc/driver/nvidia/gpus", data->root_fd);
+      if (dir) {
+	struct dirent *dirent;
+	char *env = getenv("HWLOC_KEEP_NVIDIA_GPU_NUMA_NODES");
+	int keep = env && atoi(env);
+	while ((dirent = readdir(dir)) != NULL) {
+	  char nvgpunumapath[300], line[256];
+	  int fd;
+	  snprintf(nvgpunumapath, sizeof(nvgpunumapath), "/proc/driver/nvidia/gpus/%s/numa_status", dirent->d_name);
+	  fd = hwloc_open(nvgpunumapath, data->root_fd);
+	  if (fd >= 0) {
+	    int ret;
+	    ret = read(fd, line, sizeof(line)-1);
+	    line[sizeof(line)-1] = '\0';
+	    if (ret >= 0) {
+	      const char *nvgpu_node_line = strstr(line, "Node:");
+	      if (nvgpu_node_line) {
+		unsigned nvgpu_node;
+		const char *value = nvgpu_node_line+5;
+		while (*value == ' ' || *value == '\t')
+		  value++;
+		nvgpu_node = atoi(value);
+		hwloc_debug("os node %u is NVIDIA GPU %s integrated memory\n", nvgpu_node, dirent->d_name);
+		for(i=0; i<nbnodes; i++) {
+		  hwloc_obj_t node = nodes[i];
+		  if (node && node->os_index == nvgpu_node) {
+		    if (keep) {
+		      /* keep this NUMA node but fixed its locality and add an info about the GPU */
+		      char nvgpulocalcpuspath[300];
+		      int err;
+		      node->subtype = strdup("GPUMemory");
+		      hwloc_obj_add_info(node, "PCIBusID", dirent->d_name);
+		      snprintf(nvgpulocalcpuspath, sizeof(nvgpulocalcpuspath), "/sys/bus/pci/devices/%s/local_cpus", dirent->d_name);
+		      err = hwloc__read_path_as_cpumask(nvgpulocalcpuspath, node->cpuset, data->root_fd);
+		      if (err)
+			/* the core will attach to the root */
+			hwloc_bitmap_zero(node->cpuset);
+		    } else {
+		      /* drop this NUMA node */
+		      hwloc_free_unlinked_object(node);
+		      nodes[i] = NULL;
+		    }
+		    break;
+		  }
+		}
+	      }
+	    }
+	    close(fd);
+	  }
+	}
+	closedir(dir);
+      }
 
-  /* Gather DMI info */
-  hwloc__get_dmi_id_info(data, topology->levels[0][0]);
-  if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
-    hwloc__get_firmware_dmi_memory_info(topology, data);
+      /* try to find DAX devices of KMEM NUMA nodes */
+      dir = hwloc_opendir("/sys/bus/dax/devices/", data->root_fd);
+      if (dir) {
+	struct dirent *dirent;
+	while ((dirent = readdir(dir)) != NULL) {
+	  char daxpath[300];
+	  int tmp;
+	  osnode = (unsigned) -1;
+	  snprintf(daxpath, sizeof(daxpath), "/sys/bus/dax/devices/%s/target_node", dirent->d_name);
+	  if (!hwloc_read_path_as_int(daxpath, &tmp, data->root_fd)) { /* contains %d when added in 5.1 */
+	    osnode = (unsigned) tmp;
+	    for(i=0; i<nbnodes; i++) {
+	      hwloc_obj_t node = nodes[i];
+	      if (node && node->os_index == osnode)
+		hwloc_obj_add_info(node, "DAXDevice", dirent->d_name);
+	    }
+	  }
+	}
+	closedir(dir);
+      }
 
-  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
-  if (cpuset_name) {
-    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
-    free(cpuset_name);
-  }
+      topology->support.discovery->numa = 1;
+      topology->support.discovery->numa_memory = 1;
+      topology->support.discovery->disallowed_numa = 1;
 
-  hwloc__linux_get_mic_sn(topology, data);
+      hwloc_bitmap_free(nodes_cpuset);
 
-  /* data->utsname was filled with real uname or \0, we can safely pass it */
-  hwloc_add_uname_info(topology, &data->utsname);
+      if (nbnodes <= 1) {
+	/* failed to read/create some nodes, don't bother reading/fixing
+	 * a distance matrix that would likely be wrong anyway.
+	 */
+	data->use_numa_distances = 0;
+      }
 
-  return 1;
-}
+      if (!data->use_numa_distances) {
+	free(distances);
+	distances = NULL;
+      }
 
+      if (distances && hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
+	free(distances);
+	distances = NULL;
+      }
 
+      free(indexes);
+
+      if (data->is_knl) {
+	/* apply KNL quirks */
+	char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
+	int noquirk = (env && !atoi(env));
+	if (!noquirk) {
+	  hwloc_linux_knl_numa_quirk(topology, data, nodes, nbnodes, distances, &failednodes);
+	  free(distances);
+	  free(nodes);
+	  free(trees);
+	  goto out;
+	}
+      }
 
-/****************************************
- ***** Linux PCI backend callbacks ******
- ****************************************
- * Do not support changing the fsroot (use sysfs)
- */
+      /* Fill the array of trees */
+      nr_trees = 0;
+      /* First list nodes that have a non-empty cpumap.
+       * They are likely the default nodes where we want to allocate from (DDR),
+       * make sure they are listed first in their parent memory subtree.
+       */
+      for (i = 0; i < nbnodes; i++) {
+	hwloc_obj_t node = nodes[i];
+	if (node && !hwloc_bitmap_iszero(node->cpuset)) {
+	  hwloc_obj_t tree;
+	  /* update from HMAT initiators if any */
+	  if (data->use_numa_initiators)
+	    read_node_initiators(data, node, nbnodes, nodes, path);
+
+	  tree = node;
+	  if (need_memcaches)
+	    read_node_mscaches(topology, data, path, &tree);
+	  trees[nr_trees++] = tree;
+	}
+      }
+      /* Now look for empty-cpumap nodes.
+       * Those may be the non-default nodes for allocation.
+       * Hence we don't want them to be listed first,
+       * especially if we end up fixing their actual cpumap.
+       */
+      for (i = 0; i < nbnodes; i++) {
+	hwloc_obj_t node = nodes[i];
+	if (node && hwloc_bitmap_iszero(node->cpuset)) {
+	  hwloc_obj_t tree;
+	  /* update from HMAT initiators if any */
+	  if (data->use_numa_initiators)
+	    if (!read_node_initiators(data, node, nbnodes, nodes, path))
+	      if (!hwloc_bitmap_iszero(node->cpuset))
+		goto fixed;
+
+	  /* if HMAT didn't help, try to find locality of CPU-less NUMA nodes by looking at their distances */
+	  if (distances && data->use_numa_distances_for_cpuless)
+	    fixup_cpuless_node_locality_from_distances(i, nbnodes, nodes, distances);
+
+	fixed:
+	  tree = node;
+	  if (need_memcaches)
+	    read_node_mscaches(topology, data, path, &tree);
+	  trees[nr_trees++] = tree;
+	}
+      }
 
-static hwloc_obj_t
-hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
-{
-  struct hwloc_topology *topology = backend->topology;
-  struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
-  obj->name = strdup(name);
-  obj->logical_index = -1;
-  obj->attr->osdev.type = type;
+      /* insert memory trees for real */
+      for (i = 0; i < nr_trees; i++) {
+	hwloc_obj_t tree = trees[i];
+	while (tree) {
+	  hwloc_obj_t cur_obj;
+	  hwloc_obj_t res_obj;
+	  hwloc_obj_type_t cur_type;
+	  cur_obj = tree;
+	  cur_type = cur_obj->type;
+	  tree = cur_obj->memory_first_child;
+	  assert(!cur_obj->next_sibling);
+	  res_obj = hwloc__insert_object_by_cpuset(topology, NULL, cur_obj, hwloc_report_os_error);
+	  if (res_obj != cur_obj && cur_type == HWLOC_OBJ_NUMANODE) {
+	    /* This NUMA node got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
+	     * Update it in the array for the distance matrix. */
+	    unsigned j;
+	    for(j=0; j<nbnodes; j++)
+	      if (nodes[j] == cur_obj)
+		nodes[j] = res_obj;
+	    failednodes++;
+	  }
+	}
+      }
+      free(trees);
 
-  hwloc_insert_object_by_parent(topology, pcidev, obj);
-  /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
+      /* Insert distances now that nodes are properly inserted */
+      if (distances)
+	hwloc_internal_distances_add(topology, "NUMALatency", nbnodes, nodes, distances,
+				     HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY,
+				     HWLOC_DISTANCES_ADD_FLAG_GROUP);
+      else
+	free(nodes);
 
-  return obj;
+ out:
+  *found = nbnodes - failednodes;
+  return 0;
 }
 
-typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
-
-/* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
-
-static void
-hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
+/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
+static int
+look_sysfscpu(struct hwloc_topology *topology,
+	      struct hwloc_linux_backend_data_s *data,
+	      const char *path, int old_filenames,
+	      struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
 {
-  int root_fd = data->root_fd;
+  hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
+  hwloc_bitmap_t online_set; /* Set of online CPUs if easily available, or NULL */
+#define CPU_TOPOLOGY_STR_LEN 128
+  char str[CPU_TOPOLOGY_STR_LEN];
   DIR *dir;
-  struct dirent *dirent;
-  char path[128];
-  struct stat st;
+  int i,j;
+  unsigned caches_added;
+  int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
 
-  data->deprecated_classlinks_model = -1;
+  /* try to get the list of online CPUs at once.
+   * otherwise we'll use individual per-CPU "online" files.
+   *
+   * don't use <path>/online, /sys/bus/cpu/devices only contains cpu%d
+   */
+  online_set = hwloc__alloc_read_path_as_cpulist("/sys/devices/system/cpu/online", data->root_fd);
+  if (online_set)
+    hwloc_debug_bitmap("online CPUs %s\n", online_set);
 
-  dir = hwloc_opendir("/sys/class/net", root_fd);
-  if (!dir)
-    return;
-  while ((dirent = readdir(dir)) != NULL) {
-    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
-      continue;
-    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
-    if (hwloc_stat(path, &st, root_fd) == 0) {
-      data->deprecated_classlinks_model = 0;
-      goto out;
-    }
-    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
-    if (hwloc_stat(path, &st, root_fd) == 0) {
-      data->deprecated_classlinks_model = 1;
-      goto out;
-    }
-  }
-out:
-  closedir(dir);
-}
+  /* fill the cpuset of interesting cpus */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (!dir) {
+    hwloc_bitmap_free(online_set);
+    return -1;
+  } else {
+    struct dirent *dirent;
+    cpuset = hwloc_bitmap_alloc();
 
-/* class objects that are immediately below pci devices:
- * look for objects of the given classname below a sysfs (pcidev) directory
- */
-static int
-hwloc_linux_class_readdir(struct hwloc_backend *backend,
-			  struct hwloc_obj *pcidev, const char *devicepath,
-			  hwloc_obj_osdev_type_t type, const char *classname,
-			  hwloc_linux_class_fillinfos_t fillinfo)
-{
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  int root_fd = data->root_fd;
-  size_t classnamelen = strlen(classname);
-  char path[256];
-  DIR *dir;
-  struct dirent *dirent;
-  hwloc_obj_t obj;
-  int res = 0, err;
+    while ((dirent = readdir(dir)) != NULL) {
+      unsigned long cpu;
+      char online[2];
 
-  if (data->deprecated_classlinks_model == -2)
-    hwloc_linux_check_deprecated_classlinks_model(data);
+      if (strncmp(dirent->d_name, "cpu", 3))
+	continue;
+      cpu = strtoul(dirent->d_name+3, NULL, 0);
 
-  if (data->deprecated_classlinks_model != 1) {
-    /* modern sysfs: <device>/<class>/<name> */
-    struct stat st;
-    snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
+      /* Maybe we don't have topology information for this CPU, but at least the CPU exists */
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
 
-    /* some very host kernel (2.6.9/RHEL4) have <device>/<class> symlink without any way to find <name>.
-     * make sure <device>/<class> is a directory to avoid this case.
-     */
-    err = hwloc_lstat(path, &st, root_fd);
-    if (err < 0 || !S_ISDIR(st.st_mode))
-      goto trydeprecated;
-
-    dir = hwloc_opendir(path, root_fd);
-    if (dir) {
-      data->deprecated_classlinks_model = 0;
-      while ((dirent = readdir(dir)) != NULL) {
-	if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+      /* check whether this processor is online */
+      if (online_set) {
+	if (!hwloc_bitmap_isset(online_set, cpu)) {
+	  hwloc_debug("os proc %lu is offline\n", cpu);
 	  continue;
-	obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
-	if (fillinfo) {
-	  snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
-	  fillinfo(backend, obj, path);
 	}
-	res++;
+      } else {
+	/* /sys/devices/system/cpu/online unavailable, check the cpu online file */
+	sprintf(str, "%s/cpu%lu/online", path, cpu);
+	if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
+	  if (!atoi(online)) {
+	    hwloc_debug("os proc %lu is offline\n", cpu);
+	    continue;
+	  }
+	}
       }
-      closedir(dir);
-      return res;
-    }
-  }
 
-trydeprecated:
-  if (data->deprecated_classlinks_model != 0) {
-    /* deprecated sysfs: <device>/<class>:<name> */
-    dir = hwloc_opendir(devicepath, root_fd);
-    if (dir) {
-      while ((dirent = readdir(dir)) != NULL) {
-	if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
-	  continue;
-	data->deprecated_classlinks_model = 1;
-	obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
-	if (fillinfo) {
-	  snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
-	  fillinfo(backend, obj, path);
-	}
-	res++;
+      /* check whether the kernel exports topology information for this cpu */
+      sprintf(str, "%s/cpu%lu/topology", path, cpu);
+      if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
+	hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
+		   cpu, path, cpu);
+	continue;
       }
-      closedir(dir);
-      return res;
+
+      hwloc_bitmap_set(cpuset, cpu);
     }
+    closedir(dir);
   }
 
-  return 0;
-}
+  topology->support.discovery->pu = 1;
+  topology->support.discovery->disallowed_pu = 1;
+  hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
+	     hwloc_bitmap_weight(cpuset), cpuset);
 
-/*
- * look for net objects below a pcidev in sysfs
- */
-static void
-hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
-				struct hwloc_obj *obj, const char *osdevpath)
-{
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  int root_fd = data->root_fd;
-  FILE *fd;
-  struct stat st;
-  char path[256];
-  snprintf(path, sizeof(path), "%s/address", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char address[128];
-    if (fgets(address, sizeof(address), fd)) {
-      char *eol = strchr(address, '\n');
-      if (eol)
-        *eol = 0;
-      hwloc_obj_add_info(obj, "Address", address);
-    }
-    fclose(fd);
-  }
-  snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
-  if (!hwloc_stat(path, &st, root_fd)) {
-    snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
-    fd = hwloc_fopen(path, "r", root_fd);
-    if (fd) {
-      char hexid[16];
-      if (fgets(hexid, sizeof(hexid), fd)) {
-	char *eoid;
-	unsigned long port;
-	port = strtoul(hexid, &eoid, 0);
-	if (eoid != hexid) {
-	  char portstr[16];
-	  snprintf(portstr, sizeof(portstr), "%ld", port+1);
-	  hwloc_obj_add_info(obj, "Port", portstr);
-	}
-      }
-      fclose(fd);
-    }
-  }
-}
+  caches_added = 0;
+  hwloc_bitmap_foreach_begin(i, cpuset) {
+    int tmpint;
+    int notfirstofcore = 0; /* set if we have core info and if we're not the first PU of our core */
+    int notfirstofdie = 0; /* set if we have die info and if we're not the first PU of our die */
+    hwloc_bitmap_t dieset = NULL;
 
-static int
-hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
-			     struct hwloc_obj *pcidev, const char *pcidevpath)
-{
-  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
-}
+    if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+      /* look at the core */
+      hwloc_bitmap_t coreset;
+      if (old_filenames)
+	sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
+      else
+	sprintf(str, "%s/cpu%d/topology/core_cpus", path, i);
+      coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+      if (coreset) {
+        unsigned mycoreid = (unsigned) -1;
+	int gotcoreid = 0; /* to avoid reading the coreid twice */
+	hwloc_bitmap_and(coreset, coreset, cpuset);
+	if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
+	  /* check if this is hyper-threading or different coreids */
+	  unsigned siblingid, siblingcoreid;
+
+	  mycoreid = (unsigned) -1;
+	  sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.19 */
+	  if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
+	    mycoreid = (unsigned) tmpint;
+	  gotcoreid = 1;
+
+	  siblingid = hwloc_bitmap_first(coreset);
+	  if (siblingid == (unsigned) i)
+	    siblingid = hwloc_bitmap_next(coreset, i);
+	  siblingcoreid = (unsigned) -1;
+	  sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.19 */
+	  if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
+	    siblingcoreid = (unsigned) tmpint;
+	  threadwithcoreid = (siblingcoreid != mycoreid);
+	}
+	if (hwloc_bitmap_first(coreset) != i)
+	  notfirstofcore = 1;
+	if (!notfirstofcore || threadwithcoreid) {
+	  /* regular core */
+	  struct hwloc_obj *core;
+
+	  if (!gotcoreid) {
+	    mycoreid = (unsigned) -1;
+	    sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.19 */
+	    if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
+	      mycoreid = (unsigned) tmpint;
+	  }
 
-/*
- * look for infiniband objects below a pcidev in sysfs
- */
-static void
-hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
-				       struct hwloc_obj *obj, const char *osdevpath)
-{
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  int root_fd = data->root_fd;
-  FILE *fd;
-  char path[256];
-  unsigned i,j;
+	  core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, mycoreid);
+	  if (threadwithcoreid)
+	    /* amd multicore compute-unit, create one core per thread */
+	    hwloc_bitmap_only(coreset, i);
+	  core->cpuset = coreset;
+	  hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+				  mycoreid, core->cpuset);
+	  hwloc_insert_object_by_cpuset(topology, core);
+	  coreset = NULL; /* don't free it */
+	} else
 
-  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char guidvalue[20];
-    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
-      size_t len;
-      len = strspn(guidvalue, "0123456789abcdefx:");
-      assert(len == 19);
-      guidvalue[len] = '\0';
-      hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
+	hwloc_bitmap_free(coreset);
+      }
     }
-    fclose(fd);
-  }
 
-  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char guidvalue[20];
-    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
-      size_t len;
-      len = strspn(guidvalue, "0123456789abcdefx:");
-      assert(len == 19);
-      guidvalue[len] = '\0';
-      hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
+    if (!notfirstofcore /* don't look at the die unless we are the first of the core */
+	&& hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_DIE)) {
+      /* look at the die */
+      sprintf(str, "%s/cpu%d/topology/die_cpus", path, i);
+      dieset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+      if (dieset) {
+	hwloc_bitmap_and(dieset, dieset, cpuset);
+	if (hwloc_bitmap_first(dieset) != i) {
+	  /* not first cpu in this die, ignore the die */
+	  hwloc_bitmap_free(dieset);
+	  dieset = NULL;
+	  notfirstofdie = 1;
+	}
+	/* look at packages before deciding whether we keep that die or not */
+      }
     }
-    fclose(fd);
-  }
 
-  for(i=1; ; i++) {
-    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
-    fd = hwloc_fopen(path, "r", root_fd);
-    if (fd) {
-      char statevalue[2];
-      if (fgets(statevalue, sizeof(statevalue), fd)) {
-	char statename[32];
-	statevalue[1] = '\0'; /* only keep the first byte/digit */
-	snprintf(statename, sizeof(statename), "Port%uState", i);
-	hwloc_obj_add_info(obj, statename, statevalue);
+    if (!notfirstofdie /* don't look at the package unless we are the first of the die */
+	&& hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+      /* look at the package */
+      hwloc_bitmap_t packageset;
+      if (old_filenames)
+	sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
+      else
+	sprintf(str, "%s/cpu%d/topology/package_cpus", path, i);
+      packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+      if (packageset) {
+	hwloc_bitmap_and(packageset, packageset, cpuset);
+	if (dieset && hwloc_bitmap_isequal(packageset, dieset)) {
+	  /* die is identical to package, ignore it */
+	  hwloc_bitmap_free(dieset);
+	  dieset = NULL;
+	}
+	if (hwloc_bitmap_first(packageset) == i) {
+	  /* first cpu in this package, add the package */
+	  struct hwloc_obj *package;
+	  unsigned mypackageid;
+	  mypackageid = (unsigned) -1;
+	  sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.19 */
+	  if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
+	    mypackageid = (unsigned) tmpint;
+
+	  package = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, mypackageid);
+	  package->cpuset = packageset;
+	  hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+				  mypackageid, packageset);
+	  /* add cpuinfo */
+	  if (cpuinfo_Lprocs) {
+	    for(j=0; j<(int) cpuinfo_numprocs; j++)
+	      if ((int) cpuinfo_Lprocs[j].Pproc == i) {
+		hwloc__move_infos(&package->infos, &package->infos_count,
+				  &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
+	      }
+	  }
+	  hwloc_insert_object_by_cpuset(topology, package);
+	  packageset = NULL; /* don't free it */
+	}
+	hwloc_bitmap_free(packageset);
       }
-      fclose(fd);
-    } else {
-      /* no such port */
-      break;
     }
 
-    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
-    fd = hwloc_fopen(path, "r", root_fd);
-    if (fd) {
-      char lidvalue[11];
-      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
-	char lidname[32];
-	size_t len;
-	len = strspn(lidvalue, "0123456789abcdefx");
-	lidvalue[len] = '\0';
-	snprintf(lidname, sizeof(lidname), "Port%uLID", i);
-	hwloc_obj_add_info(obj, lidname, lidvalue);
-      }
-      fclose(fd);
+    if (dieset) {
+      struct hwloc_obj *die;
+      unsigned mydieid;
+      mydieid = (unsigned) -1;
+      sprintf(str, "%s/cpu%d/topology/die_id", path, i); /* contains %d when added in 5.2 */
+      if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
+	mydieid = (unsigned) tmpint;
+
+      die = hwloc_alloc_setup_object(topology, HWLOC_OBJ_DIE, mydieid);
+      die->cpuset = dieset;
+      hwloc_debug_1arg_bitmap("os die %u has cpuset %s\n",
+			      mydieid, dieset);
+      hwloc_insert_object_by_cpuset(topology, die);
     }
 
-    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
-    fd = hwloc_fopen(path, "r", root_fd);
-    if (fd) {
-      char lidvalue[11];
-      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
-	char lidname[32];
-	size_t len;
-	len = strspn(lidvalue, "0123456789");
-	lidvalue[len] = '\0';
-	snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
-	hwloc_obj_add_info(obj, lidname, lidvalue);
+    if (data->arch == HWLOC_LINUX_ARCH_S390
+	&& hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+      /* look at the books */
+      hwloc_bitmap_t bookset, drawerset;
+      sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
+      bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+      if (bookset) {
+	hwloc_bitmap_and(bookset, bookset, cpuset);
+	if (hwloc_bitmap_first(bookset) == i) {
+	  struct hwloc_obj *book;
+	  unsigned mybookid;
+	  mybookid = (unsigned) -1;
+	  sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.19 */
+	  if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
+	    mybookid = (unsigned) tmpint;
+
+	    book = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, mybookid);
+	    book->cpuset = bookset;
+	    hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
+				    mybookid, bookset);
+	    book->subtype = strdup("Book");
+	    book->attr->group.kind = HWLOC_GROUP_KIND_S390_BOOK;
+	    book->attr->group.subkind = 0;
+	    hwloc_insert_object_by_cpuset(topology, book);
+	    bookset = NULL; /* don't free it */
+	  }
+        }
+	hwloc_bitmap_free(bookset);
+
+	sprintf(str, "%s/cpu%d/topology/drawer_siblings", path, i);
+	drawerset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+	if (drawerset) {
+	  hwloc_bitmap_and(drawerset, drawerset, cpuset);
+	  if (hwloc_bitmap_first(drawerset) == i) {
+	    struct hwloc_obj *drawer;
+	    unsigned mydrawerid;
+	    mydrawerid = (unsigned) -1;
+	    sprintf(str, "%s/cpu%d/topology/drawer_id", path, i); /* contains %d at least up to 4.19 */
+	    if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
+	      mydrawerid = (unsigned) tmpint;
+
+	      drawer = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, mydrawerid);
+	      drawer->cpuset = drawerset;
+	      hwloc_debug_1arg_bitmap("os drawer %u has cpuset %s\n",
+				      mydrawerid, drawerset);
+	      drawer->subtype = strdup("Drawer");
+	      drawer->attr->group.kind = HWLOC_GROUP_KIND_S390_BOOK;
+	      drawer->attr->group.subkind = 1;
+	      hwloc_insert_object_by_cpuset(topology, drawer);
+	      drawerset = NULL; /* don't free it */
+	    }
+	  }
+	  hwloc_bitmap_free(drawerset);
+	}
       }
-      fclose(fd);
     }
 
-    for(j=0; ; j++) {
-      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
-      fd = hwloc_fopen(path, "r", root_fd);
-      if (fd) {
-	char gidvalue[40];
-	if (fgets(gidvalue, sizeof(gidvalue), fd)) {
-	  char gidname[32];
-	  size_t len;
-	  len = strspn(gidvalue, "0123456789abcdefx:");
-	  assert(len == 39);
-	  gidvalue[len] = '\0';
-	  if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
-	    /* only keep initialized GIDs */
-	    snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
-	    hwloc_obj_add_info(obj, gidname, gidvalue);
+    /* PU cannot be filtered-out */
+    {
+      /* look at the thread */
+      hwloc_bitmap_t threadset;
+      struct hwloc_obj *thread = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, (unsigned) i);
+      threadset = hwloc_bitmap_alloc();
+      hwloc_bitmap_only(threadset, i);
+      thread->cpuset = threadset;
+      hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
+		 i, threadset);
+      hwloc_insert_object_by_cpuset(topology, thread);
+    }
+
+    /* look at the caches */
+    for(j=0; j<10; j++) {
+      char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
+      hwloc_bitmap_t cacheset;
+
+      sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
+      cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+      if (cacheset) {
+	if (hwloc_bitmap_iszero(cacheset)) {
+	  /* ia64 returning empty L3 and L2i? use the core set instead */
+	  hwloc_bitmap_t tmpset;
+	  if (old_filenames)
+	    sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
+	  else
+	    sprintf(str, "%s/cpu%d/topology/core_cpus", path, i);
+	  tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
+	  /* only use it if we actually got something */
+	  if (tmpset) {
+	    hwloc_bitmap_free(cacheset);
+	    cacheset = tmpset;
 	  }
 	}
-	fclose(fd);
-      } else {
-	/* no such port */
-	break;
+	hwloc_bitmap_and(cacheset, cacheset, cpuset);
+
+	if (hwloc_bitmap_first(cacheset) == i) {
+	  unsigned kB;
+	  unsigned linesize;
+	  unsigned sets, lines_per_tag;
+	  unsigned depth; /* 1 for L1, .... */
+	  hwloc_obj_cache_type_t ctype = HWLOC_OBJ_CACHE_UNIFIED; /* default */
+	  hwloc_obj_type_t otype;
+	  struct hwloc_obj *cache;
+
+	  /* get the cache level depth */
+	  sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.19 */
+	  if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
+	    hwloc_bitmap_free(cacheset);
+	    continue;
+	  }
+
+	  /* cache type */
+	  sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
+	  if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
+	    if (!strncmp(str2, "Data", 4))
+	      ctype = HWLOC_OBJ_CACHE_DATA;
+	    else if (!strncmp(str2, "Unified", 7))
+	      ctype = HWLOC_OBJ_CACHE_UNIFIED;
+	    else if (!strncmp(str2, "Instruction", 11))
+	      ctype = HWLOC_OBJ_CACHE_INSTRUCTION;
+	  }
+
+	  otype = hwloc_cache_type_by_depth_type(depth, ctype);
+	  if (otype == HWLOC_OBJ_TYPE_NONE
+	      || !hwloc_filter_check_keep_object_type(topology, otype)) {
+	    hwloc_bitmap_free(cacheset);
+	    continue;
+	  }
+
+	  /* FIXME: if Bulldozer/Piledriver, add compute unit Groups when L2/L1i filtered-out */
+	  /* FIXME: if KNL, add tile Groups when L2/L1i filtered-out */
+
+	  /* get the cache size */
+	  kB = 0;
+	  sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.19 */
+	  hwloc_read_path_as_uint(str, &kB, data->root_fd);
+	  /* KNL reports L3 with size=0 and full cpuset in cpuid.
+	   * Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
+	   */
+	  if (!kB && otype == HWLOC_OBJ_L3CACHE && data->is_knl) {
+	    hwloc_bitmap_free(cacheset);
+	    continue;
+	  }
+
+	  /* get the line size */
+	  linesize = 0;
+	  sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.19 */
+	  hwloc_read_path_as_uint(str, &linesize, data->root_fd);
+
+	  /* get the number of sets and lines per tag.
+	   * don't take the associativity directly in "ways_of_associativity" because
+	   * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
+	   */
+	  sets = 0;
+	  sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.19 */
+	  hwloc_read_path_as_uint(str, &sets, data->root_fd);
+
+	  lines_per_tag = 1;
+	  sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.19 */
+	  hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
+
+	  /* first cpu in this cache, add the cache */
+	  cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
+	  cache->attr->cache.size = ((uint64_t)kB) << 10;
+	  cache->attr->cache.depth = depth;
+	  cache->attr->cache.linesize = linesize;
+	  cache->attr->cache.type = ctype;
+	  if (!linesize || !lines_per_tag || !sets)
+	    cache->attr->cache.associativity = 0; /* unknown */
+	  else if (sets == 1)
+	    cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
+	  else
+	    cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
+	  cache->cpuset = cacheset;
+	  hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
+				  depth, cacheset);
+	  hwloc_insert_object_by_cpuset(topology, cache);
+	  cacheset = NULL; /* don't free it */
+	  ++caches_added;
+	}
       }
-    }
-  }
+      hwloc_bitmap_free(cacheset);
+     }
+
+  } hwloc_bitmap_foreach_end();
+
+  if (0 == caches_added && data->use_dt)
+    look_powerpc_device_tree(topology, data);
+
+  hwloc_bitmap_free(cpuset);
+  hwloc_bitmap_free(online_set);
+
+  return 0;
 }
 
+
+
+/****************************************
+ ****** cpuinfo Topology Discovery ******
+ ****************************************/
+
 static int
-hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
-				     struct hwloc_obj *pcidev, const char *pcidevpath)
+hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
+			      struct hwloc_info_s **infos, unsigned *infos_count,
+			      int is_global __hwloc_attribute_unused)
 {
-  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
+  if (!strcmp("vendor_id", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUVendor", value);
+  } else if (!strcmp("model name", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModel", value);
+  } else if (!strcmp("model", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
+  } else if (!strcmp("cpu family", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
+  } else if (!strcmp("stepping", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUStepping", value);
+  }
+  return 0;
 }
 
-/* look for dma objects below a pcidev in sysfs */
 static int
-hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
-			     struct hwloc_obj *pcidev, const char *pcidevpath)
+hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
+			       struct hwloc_info_s **infos, unsigned *infos_count,
+			       int is_global __hwloc_attribute_unused)
 {
-  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
+  if (!strcmp("vendor", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUVendor", value);
+  } else if (!strcmp("model name", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModel", value);
+  } else if (!strcmp("model", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
+  } else if (!strcmp("family", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
+  }
+  return 0;
 }
 
-/* look for drm objects below a pcidev in sysfs */
+/* Handle one "prefix: value" pair from cpuinfo when the architecture is ARM.
+ * CPU-related prefixes are stored as CPUModel, CPUImplementer, CPUArchitecture,
+ * CPUVariant, CPUPart and CPURevision; board-related prefixes ("Hardware",
+ * "Revision", "Serial") are stored as HardwareName, HardwareRevision and
+ * HardwareSerial. Empty values and unknown prefixes are ignored.
+ * is_global is unused here. Always returns 0.
+ */
 static int
-hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
-			     struct hwloc_obj *pcidev, const char *pcidevpath)
+hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
+			      struct hwloc_info_s **infos, unsigned *infos_count,
+			      int is_global __hwloc_attribute_unused)
 {
-  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
-
-  /* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */
+  if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
+      || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModel", value);
+  } else if (!strcmp("CPU implementer", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUImplementer", value);
+  } else if (!strcmp("CPU architecture", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
+  } else if (!strcmp("CPU variant", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUVariant", value);
+  } else if (!strcmp("CPU part", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUPart", value);
+  } else if (!strcmp("CPU revision", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPURevision", value);
+  } else if (!strcmp("Hardware", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "HardwareName", value);
+  } else if (!strcmp("Revision", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "HardwareRevision", value);
+  } else if (!strcmp("Serial", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "HardwareSerial", value);
+  }
+  return 0;
+}
 
-  /* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
-   * so we could create a OS device for each PCI devices with such a field.
-   * boot_vga is actually created when class >> 8 == VGA (it contains 1 for boot vga device), so it's trivial anyway.
-   */
+/* Handle one "prefix: value" pair from cpuinfo when the architecture is POWER.
+ * Stores CPUModel, Platform* and version-register attributes in the given
+ * info array. Empty values and unknown prefixes are ignored.
+ * is_global selects the attribute name used for revision lines:
+ * "PlatformRevision" when non-zero, "CPURevision" otherwise.
+ * Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
+			      struct hwloc_info_s **infos, unsigned *infos_count,
+			      int is_global)
+{
+  /* common fields */
+  if (!strcmp("cpu", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "CPUModel", value);
+  } else if (!strcmp("platform", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "PlatformName", value);
+  } else if (!strcmp("model", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "PlatformModel", value);
+  }
+  /* platform-specific fields */
+  else if (!strcasecmp("vendor", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "PlatformVendor", value);
+  } else if (!strcmp("Board ID", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
+  } else if (!strcmp("Board", prefix)
+	     || !strcasecmp("Machine", prefix)) {
+    /* machine and board are similar (and often more precise) than model above */
+    /* NOTE(review): the trailing 1 presumably asks the nodup variant to replace
+     * an existing PlatformModel entry -- confirm against hwloc__add_info_nodup().
+     */
+    if (value[0])
+      hwloc__add_info_nodup(infos, infos_count, "PlatformModel", value, 1);
+  } else if (!strcasecmp("Revision", prefix)
+	     || !strcmp("Hardware rev", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
+  } else if (!strcmp("SVR", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
+  } else if (!strcmp("PVR", prefix)) {
+    if (value[0])
+      hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
+  }
+  /* don't match 'board*' because there's also "board l2" on some platforms */
+  return 0;
 }
 
 /*
- * look for block objects below a pcidev in sysfs
+ * avr32: "chip type\t:"			=> OK
+ * blackfin: "model name\t:"			=> OK
+ * h8300: "CPU:"				=> OK
+ * m68k: "CPU:"					=> OK
+ * mips: "cpu model\t\t:"			=> OK
+ * openrisc: "CPU:"				=> OK
+ * sparc: "cpu\t\t:"				=> OK
+ * tile: "model name\t:"			=> OK
+ * unicore32: "Processor\t:"			=> OK
+ * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:"	=> "cpu" overwritten by "cpu model", no processor indexes
+ * cris: "cpu\t\t:" + "cpu model\t:"		=> only "cpu"
+ * frv: "CPU-Core:" + "CPU:"			=> only "CPU"
+ * mn10300: "cpu core   :" + "model name :"	=> only "model name"
+ * parisc: "cpu family\t:" + "cpu\t\t:"		=> only "cpu"
+ *
+ * not supported because of conflicts with other arch minor lines:
+ * m32r: "cpu family\t:"			=> KO (adding "cpu family" would break "blackfin")
+ * microblaze: "CPU-Family:"			=> KO
+ * sh: "cpu family\t:" + "cpu type\t:"		=> KO
+ * xtensa: "model\t\t:"				=> KO
  */
-
-static void
-hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
-				  struct hwloc_obj *obj, const char *osdevpath)
+static int
+hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
+				  struct hwloc_info_s **infos, unsigned *infos_count,
+				  int is_global __hwloc_attribute_unused)
 {
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  int root_fd = data->root_fd;
+  /* Fallback parser for architectures without a dedicated routine:
+   * any of the known "processor model" keys feeds the CPUModel attribute.
+   * Only "cpu" is matched case-insensitively.
+   */
+  int names_cpu_model = !strcmp("model name", prefix)
+    || !strcmp("Processor", prefix)
+    || !strcmp("chip type", prefix)
+    || !strcmp("cpu model", prefix)
+    || !strcasecmp("cpu", prefix);
+
+  /* Several of these keys may appear for the same processor. The last one
+   * is assumed to be the most precise, and the Architecture keypair
+   * provides the basic information anyway.
+   */
+  if (names_cpu_model && value[0])
+    hwloc__add_info_nodup(infos, infos_count, "CPUModel", value, 1);
+
+  return 0;
+}
+
+/* Parse the cpuinfo-style file at path (opened relative to data->root_fd).
+ *
+ * Each "processor : N" line starts a new entry in a dynamically grown array
+ * of struct hwloc_linux_cpuinfo_proc. Every other "prefix : value" line is
+ * handed to the architecture-specific parser selected from data->arch, and is
+ * stored either in the current processor entry or, when no processor line has
+ * been seen since the last blank line, in global_infos/global_infos_count.
+ *
+ * Returns the number of processor entries and stores the array in *Lprocs_p
+ * (NULL when no entry was found), or returns -1 on error (file cannot be
+ * opened, malformed processor number, allocation failure).
+ * *Lprocs_p is set to NULL unless the return value is > 0.
+ * Infos already appended to global_infos before an error are left to the caller.
+ */
+static int
+hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
+			  const char *path,
+			  struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
+			  struct hwloc_info_s **global_infos, unsigned *global_infos_count)
+{
+  /* FIXME: only parse once per package and once for globals? */
   FILE *fd;
+  char str[128]; /* vendor/model can be very long */
+  char *endptr;
+  unsigned allocated_Lprocs = 0;
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  unsigned numprocs = 0;
+  unsigned i;
+  int curproc = -1;
+  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_info_s **, unsigned *, int) = NULL;
+
+  /* honor the documented contract on every failure path, including open failure */
+  *Lprocs_p = NULL;
+
+  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
+    {
+      hwloc_debug("could not open %s\n", path);
+      return -1;
+    }
+
+  /* architecture specific or default routine for parsing cpumodel.
+   * data->arch does not change while parsing, so select it once.
+   */
+  switch (data->arch) {
+  case HWLOC_LINUX_ARCH_X86:
+    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
+    break;
+  case HWLOC_LINUX_ARCH_ARM:
+    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
+    break;
+  case HWLOC_LINUX_ARCH_POWER:
+    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
+    break;
+  case HWLOC_LINUX_ARCH_IA64:
+    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
+    break;
+  default:
+    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
+  }
+
+#      define PROCESSOR	"processor"
+  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
+  while (fgets(str, sizeof(str), fd)!=NULL) {
+    unsigned long Pproc;
+    char *end, *dot, *prefix, *value;
+    int noend = 0;
+
+    /* remove the ending \n */
+    end = strchr(str, '\n');
+    if (end)
+      *end = 0;
+    else
+      noend = 1; /* line longer than str[], the remainder is discarded below */
+    /* if empty line, skip and reset curproc */
+    if (!*str) {
+      curproc = -1;
+      continue;
+    }
+    /* skip lines with no colon separator */
+    dot = strchr(str, ':');
+    if (!dot)
+      continue;
+    /* skip lines not starting with a letter */
+    if ((*str > 'z' || *str < 'a')
+	&& (*str > 'Z' || *str < 'A'))
+      continue;
+
+    /* mark the end of the prefix.
+     * str[0] is a letter, so the backward scan cannot run before the buffer.
+     */
+    prefix = str;
+    end = dot;
+    while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
+    *end = 0;
+    /* find beginning of value, its end is already marked */
+    value = dot+1 + strspn(dot+1, " \t");
+
+    /* defines for parsing numbers */
+#   define getprocnb_begin(field, var)					\
+    if (!strcmp(field,prefix)) {					\
+      var = strtoul(value,&endptr,0);					\
+      if (endptr==value) {						\
+	hwloc_debug("no number in "field" field of %s\n", path);	\
+	goto err;							\
+      } else if (var==ULONG_MAX) {					\
+	hwloc_debug("too big "field" number in %s\n", path); 		\
+	goto err;							\
+      }									\
+      hwloc_debug(field " %lu\n", var)
+#   define getprocnb_end()						\
+    }
+    /* actually parse numbers */
+    getprocnb_begin(PROCESSOR, Pproc);
+    /* grow the array before claiming the slot, so that numprocs only ever
+     * counts initialized entries (the error path relies on this).
+     */
+    if (numprocs == allocated_Lprocs) {
+      struct hwloc_linux_cpuinfo_proc * tmp;
+      unsigned newsize = allocated_Lprocs ? 2*allocated_Lprocs : 8;
+      tmp = realloc(Lprocs, newsize * sizeof(*Lprocs));
+      if (!tmp)
+	goto err;
+      Lprocs = tmp;
+      allocated_Lprocs = newsize;
+    }
+    curproc = numprocs++;
+    Lprocs[curproc].Pproc = Pproc;
+    Lprocs[curproc].infos = NULL;
+    Lprocs[curproc].infos_count = 0;
+    getprocnb_end() else {
+
+      /* we can't assume that we already got a processor index line:
+       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
+       * tile has a global section with model name before the list of processor lines.
+       */
+      parse_cpuinfo_func(prefix, value,
+			 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
+			 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
+			 curproc < 0);
+    }
+
+    if (noend) {
+      /* ignore end of line */
+      if (fscanf(fd,"%*[^\n]") == EOF)
+	break;
+      getc(fd);
+    }
+  }
+  fclose(fd);
+
+  *Lprocs_p = Lprocs;
+  return numprocs;
+
+ err:
+  fclose(fd);
+  /* release the infos gathered so far, the caller only gets NULL back */
+  for(i=0; i<numprocs; i++)
+    hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
+  free(Lprocs);
+  *Lprocs_p = NULL;
+  return -1;
+}
+
+/* Release everything produced by a cpuinfo parse: the infos attached to each
+ * of the numprocs entries of Lprocs, the Lprocs array itself (may be NULL),
+ * and the global infos.
+ */
+static void
+hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
+			 struct hwloc_info_s *global_infos, unsigned global_infos_count)
+{
+  if (Lprocs != NULL) {
+    struct hwloc_linux_cpuinfo_proc *proc = Lprocs;
+    struct hwloc_linux_cpuinfo_proc *stop = Lprocs + numprocs;
+
+    while (proc != stop) {
+      hwloc__free_infos(proc->infos, proc->infos_count);
+      proc++;
+    }
+    free(Lprocs);
+  }
+
+  hwloc__free_infos(global_infos, global_infos_count);
+}
+
+
+
+/*************************************
+ ****** Main Topology Discovery ******
+ *************************************/
+
+/* Read /proc/elog and, when it starts with "Card <serial>:", attach the
+ * non-empty <serial> to the root object as MICSerialNumber.
+ */
+static void
+hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
+{
+  static const char card_prefix[] = "Card ";
+  const size_t card_prefix_len = sizeof(card_prefix) - 1;
+  char buffer[64];
+  char *serial, *colon;
+
+  if (hwloc_read_path_by_length("/proc/elog", buffer, sizeof(buffer), data->root_fd) < 0)
+    return;
+  if (strncmp(buffer, card_prefix, card_prefix_len) != 0)
+    return;
+
+  /* the serial number runs from the end of the prefix up to the first colon */
+  serial = buffer + card_prefix_len;
+  colon = strchr(serial, ':');
+  if (colon == NULL)
+    return;
+  *colon = '\0';
+
+  if (*serial != '\0')
+    hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", serial);
+}
+
+/* Fill data->utsname, data->fallback_nbprocessors, data->pagesize and
+ * data->arch.
+ *
+ * Values come from the running system when topology->is_thissystem is set,
+ * and may be overridden by /proc/hwloc-nofile-info (read through
+ * data->root_fd) when the filesystem root is not the real one.
+ * If the HWLOC_DUMP_NOFILE_INFO environment variable names a file, the
+ * resulting values are written there in the same "Key: value" format.
+ * Finally the architecture is derived from configure-time defines or from
+ * utsname.machine, and only if data->arch is still HWLOC_LINUX_ARCH_UNKNOWN.
+ */
+static void
+hwloc_gather_system_info(struct hwloc_topology *topology,
+			 struct hwloc_linux_backend_data_s *data)
+{
+  FILE *file;
+  char line[128]; /* enough for utsname fields */
+  const char *env;
+
+  /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
+  memset(&data->utsname, 0, sizeof(data->utsname));
+  data->fallback_nbprocessors = -1; /* unknown yet */
+  data->pagesize = 4096;
+
+  /* read thissystem info */
+  if (topology->is_thissystem) {
+    uname(&data->utsname);
+    data->fallback_nbprocessors = hwloc_fallback_nbprocessors(0); /* errors managed in hwloc_linux_fallback_pu_level() */
+    data->pagesize = hwloc_getpagesize();
+  }
+
+  if (!data->is_real_fsroot) {
+   /* overwrite with optional /proc/hwloc-nofile-info */
+   file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
+   if (file) {
+    while (fgets(line, sizeof(line), file)) {
+      /* tmp points to the trailing newline (if any), it is cut before using the value */
+      char *tmp = strchr(line, '\n');
+      if (!strncmp("OSName: ", line, 8)) {
+	if (tmp)
+	  *tmp = '\0';
+	/* strncpy() may leave the field unterminated, hence the explicit terminator */
+	strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
+	data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
+      } else if (!strncmp("OSRelease: ", line, 11)) {
+	if (tmp)
+	  *tmp = '\0';
+	strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
+	data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
+      } else if (!strncmp("OSVersion: ", line, 11)) {
+	if (tmp)
+	  *tmp = '\0';
+	strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
+	data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
+      } else if (!strncmp("HostName: ", line, 10)) {
+	if (tmp)
+	  *tmp = '\0';
+	strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
+	data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
+      } else if (!strncmp("Architecture: ", line, 14)) {
+	if (tmp)
+	  *tmp = '\0';
+	strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
+	data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
+      } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
+	if (tmp)
+	  *tmp = '\0';
+	data->fallback_nbprocessors = atoi(line+22);
+      } else if (!strncmp("PageSize: ", line, 10)) {
+	if (tmp)
+	 *tmp = '\0';
+	data->pagesize = strtoull(line+10, NULL, 10);
+      } else {
+	hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
+	/* ignored */
+      }
+    }
+    fclose(file);
+   }
+  }
+
+  /* optionally dump what was gathered, in the format parsed above.
+   * Note that this uses plain fopen(), i.e. the path is not relative to data->root_fd.
+   */
+  env = getenv("HWLOC_DUMP_NOFILE_INFO");
+  if (env && *env) {
+    file = fopen(env, "w");
+    if (file) {
+      if (*data->utsname.sysname)
+	fprintf(file, "OSName: %s\n", data->utsname.sysname);
+      if (*data->utsname.release)
+	fprintf(file, "OSRelease: %s\n", data->utsname.release);
+      if (*data->utsname.version)
+	fprintf(file, "OSVersion: %s\n", data->utsname.version);
+      if (*data->utsname.nodename)
+	fprintf(file, "HostName: %s\n", data->utsname.nodename);
+      if (*data->utsname.machine)
+	fprintf(file, "Architecture: %s\n", data->utsname.machine);
+      fprintf(file, "FallbackNbProcessors: %d\n", data->fallback_nbprocessors);
+      fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
+      fclose(file);
+    }
+  }
+
+  /* detect arch for quirks, using configure #defines if possible, or uname */
+#if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
+  if (topology->is_thissystem)
+    data->arch = HWLOC_LINUX_ARCH_X86;
+#endif
+  /* NOTE(review): this relies on data->arch having been initialized to
+   * HWLOC_LINUX_ARCH_UNKNOWN before this function runs -- confirm at backend setup.
+   */
+  if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
+    if (!strcmp(data->utsname.machine, "x86_64")
+	|| (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86")) /* i386, i486, i586, i686, ... */
+	|| !strcmp(data->utsname.machine, "k1om"))
+      data->arch = HWLOC_LINUX_ARCH_X86;
+    else if (!strncmp(data->utsname.machine, "arm", 3))
+      data->arch = HWLOC_LINUX_ARCH_ARM;
+    else if (!strncmp(data->utsname.machine, "ppc", 3)
+	     || !strncmp(data->utsname.machine, "power", 5))
+      data->arch = HWLOC_LINUX_ARCH_POWER;
+    else if (!strncmp(data->utsname.machine, "s390", 4))
+      data->arch = HWLOC_LINUX_ARCH_S390;
+    else if (!strcmp(data->utsname.machine, "ia64"))
+      data->arch = HWLOC_LINUX_ARCH_IA64;
+  }
+}
+
+/* Try to build the CPU topology from a hardwired description instead of
+ * reading it from the system. Only attempted when utsname.machine is "s64fx"
+ * and the HWLOC_NO_HARDWIRED_TOPOLOGY environment variable is not set.
+ *
+ * returns 0 on success, -1 on non-match or error during hardwired load
+ */
+static int
+hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+
+  if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
+    return -1;
+
+  if (!strcmp(data->utsname.machine, "s64fx")) {
+    char line[128];
+    /* Fujitsu K-computer, FX10, and FX100 use specific processors
+     * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
+     * and existing machines will likely never be fixed by kernel upgrade.
+     */
+
+    /* /proc/cpuinfo starts with one of these lines:
+     * "cpu             : Fujitsu SPARC64 VIIIfx"
+     * "cpu             : Fujitsu SPARC64 XIfx"
+     * "cpu             : Fujitsu SPARC64 IXfx"
+     */
+    if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
+      return -1;
+
+    /* the key must be "cpu" immediately followed by a tab */
+    if (strncmp(line, "cpu\t", 4))
+      return -1;
+
+    /* NOTE(review): the XIfx match below uses upper-case "FUJITSU" while the
+     * comment above lists "Fujitsu SPARC64 XIfx" -- confirm which spelling
+     * FX100 machines actually report.
+     */
+    if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
+      return hwloc_look_hardwired_fujitsu_k(topology);
+    else if (strstr(line, "Fujitsu SPARC64 IXfx"))
+      return hwloc_look_hardwired_fujitsu_fx10(topology);
+    else if (strstr(line, "FUJITSU SPARC64 XIfx"))
+      return hwloc_look_hardwired_fujitsu_fx100(topology);
+  }
+  return -1;
+}
+
+/* Restrict topology->allowed_cpuset and topology->allowed_nodeset according to
+ * the cpuset/cgroup of the target process, when a cpuset or cgroup mount point
+ * is found. *cpuset_namep receives the name of that cpuset (or NULL if there
+ * is none).
+ */
+static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
+{
+  char *cgroup_mnt, *cpuset_mnt;
+  char *name;
+
+  hwloc_find_linux_cpuset_mntpnt(&cgroup_mnt, &cpuset_mnt, root_path);
+  if (!cgroup_mnt && !cpuset_mnt) {
+    /* neither cgroup nor cpuset is mounted, nothing can be restricted */
+    *cpuset_namep = NULL;
+    return;
+  }
+
+  name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
+  if (name != NULL) {
+    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mnt, cpuset_mnt, name, "cpus", topology->allowed_cpuset);
+    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mnt, cpuset_mnt, name, "mems", topology->allowed_nodeset);
+  }
+
+  free(cgroup_mnt);
+  free(cpuset_mnt);
+  *cpuset_namep = name;
+}
+
+/* Create the PU level from the fallback processor count when nothing better
+ * is available. A count below 1 is replaced by a single PU; otherwise PU
+ * discovery is reported as supported.
+ */
+static void
+hwloc_linux_fallback_pu_level(struct hwloc_backend *backend)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  struct hwloc_topology *topo = backend->topology;
+
+  if (priv->fallback_nbprocessors < 1) {
+    priv->fallback_nbprocessors = 1;
+  } else {
+    topo->support.discovery->pu = 1;
+  }
+
+  hwloc_setup_pu_level(topo, priv->fallback_nbprocessors);
+}
+
+/* Return non-zero when <dir>/cpu0/topology/<name> is readable. */
+static int
+hwloc__sysfs_cpu0_topology_readable(int root_fd, const char *dir, const char *name)
+{
+  char path[128]; /* large enough for both candidate directories below */
+  snprintf(path, sizeof(path), "%s/cpu0/topology/%s", dir, name);
+  return !hwloc_access(path, R_OK, root_fd);
+}
+
+/* Find the sysfs directory describing CPUs.
+ * Candidate directories are tried in order. One is returned as soon as cpu0
+ * exposes topology files there, either with the new names
+ * (package_cpus/core_cpus) or with the old names
+ * (core_siblings/thread_siblings), in which case *old_filenames is set to 1.
+ * Returns NULL when no candidate is usable. *old_filenames is left untouched
+ * unless old names are what was found.
+ */
+static const char *find_sysfs_cpu_path(int root_fd, int *old_filenames)
+{
+  static const char * const candidates[] = {
+    "/sys/bus/cpu/devices",
+    "/sys/devices/system/cpu"
+  };
+  unsigned i;
+
+  for(i=0; i<sizeof(candidates)/sizeof(candidates[0]); i++) {
+    const char *dir = candidates[i];
+
+    if (hwloc_access(dir, R_OK|X_OK, root_fd))
+      continue;
+
+    if (hwloc__sysfs_cpu0_topology_readable(root_fd, dir, "package_cpus")
+	|| hwloc__sysfs_cpu0_topology_readable(root_fd, dir, "core_cpus"))
+      return dir;
+
+    if (hwloc__sysfs_cpu0_topology_readable(root_fd, dir, "core_siblings")
+	|| hwloc__sysfs_cpu0_topology_readable(root_fd, dir, "thread_siblings")) {
+      *old_filenames = 1;
+      return dir;
+    }
+  }
+
+  return NULL;
+}
+
+/* Find the sysfs directory describing NUMA nodes: the first of the two known
+ * locations that is accessible and exposes node0/cpumap, or NULL if neither does.
+ */
+static const char *find_sysfs_node_path(int root_fd)
+{
+  const char *found = NULL;
+
+  if (!hwloc_access("/sys/bus/node/devices", R_OK|X_OK, root_fd)
+      && !hwloc_access("/sys/bus/node/devices/node0/cpumap", R_OK, root_fd)) {
+    found = "/sys/bus/node/devices";
+  } else if (!hwloc_access("/sys/devices/system/node", R_OK|X_OK, root_fd)
+	     && !hwloc_access("/sys/devices/system/node/node0/cpumap", R_OK, root_fd)) {
+    found = "/sys/devices/system/node";
+  }
+
+  return found;
+}
+
+/* CPU and memory discovery for the Linux backend.
+ *
+ * Steps, in order: locate the sysfs cpu and node directories, gather system
+ * information, parse /proc/cpuinfo, detect x86 models needing quirks, read
+ * the allowed cpus/mems (unless dstatus says it was already done), discover
+ * CPUs (skipped when PUs already exist or when a hardwired topology applies),
+ * then discover or annotate NUMA memory, and finally attach miscellaneous
+ * info attributes to the root object.
+ *
+ * Always returns 0.
+ */
+static int
+hwloc_linuxfs_look_cpu(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
+{
+  /*
+   * This backend may be used with topology->is_thissystem set (default)
+   * or not (modified fsroot path).
+   */
+
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  unsigned nbnodes;
+  char *cpuset_name = NULL;
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  struct hwloc_info_s *global_infos = NULL;
+  unsigned global_infos_count = 0;
+  int numprocs;
+  int already_pus;
+  int already_numanodes;
+  const char *sysfs_cpu_path;
+  const char *sysfs_node_path;
+  int old_siblings_filenames = 0;
+  int err;
+
+  /* look for sysfs cpu path containing at least one of core_siblings and thread_siblings */
+  /* NOTE(review): sysfs_cpu_path and sysfs_node_path may be NULL here and are
+   * passed to a "%s" conversion -- confirm hwloc_debug() tolerates that in debug builds.
+   */
+  sysfs_cpu_path = find_sysfs_cpu_path(data->root_fd, &old_siblings_filenames);
+  hwloc_debug("Found sysfs cpu files under %s with %s topology filenames\n",
+	      sysfs_cpu_path, old_siblings_filenames ? "old" : "new");
+
+  /* look for sysfs node path */
+  sysfs_node_path = find_sysfs_node_path(data->root_fd);
+  hwloc_debug("Found sysfs node files under %s\n",
+	      sysfs_node_path);
+
+  already_pus = (topology->levels[0][0]->complete_cpuset != NULL
+		 && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
+  /* if there are PUs, still look at memory information
+   * since x86 misses NUMA node information (unless we forced AMD topoext NUMA nodes)
+   * memory size.
+   */
+  already_numanodes = (topology->levels[0][0]->complete_nodeset != NULL
+		       && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset));
+  /* if there are already NUMA nodes, we'll just annotate them with memory information,
+   * which requires the NUMA level to be connected.
+   */
+  if (already_numanodes)
+    hwloc_topology_reconnect(topology, 0);
+
+  hwloc_alloc_root_sets(topology->levels[0][0]);
+
+  /*********************************
+   * Platform information for later
+   */
+  hwloc_gather_system_info(topology, data);
+
+  /**********************
+   * /proc/cpuinfo
+   */
+  numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
+  /* a parse failure is not fatal, continue without per-processor infos */
+  if (numprocs < 0)
+    numprocs = 0;
+
+  /**************************
+   * detect model for quirks
+   */
+  if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
+      unsigned i;
+      const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
+      /* only the first processor entry is inspected */
+      for(i=0; i<Lprocs[0].infos_count; i++) {
+	if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
+	  cpuvendor = Lprocs[0].infos[i].value;
+	} else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
+	  cpufamilynumber = Lprocs[0].infos[i].value;
+	} else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
+	  cpumodelnumber = Lprocs[0].infos[i].value;
+	}
+      }
+      /* Intel family 6, model 87 or 133 */
+      if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
+	  && cpufamilynumber && !strcmp(cpufamilynumber, "6")
+	  && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
+	  || !strcmp(cpumodelnumber, "133")))
+	data->is_knl = 1;
+      /* AMD family 21 or 22 */
+      if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
+	  && cpufamilynumber
+	  && (!strcmp(cpufamilynumber, "21")
+	      || !strcmp(cpufamilynumber, "22")))
+	data->is_amd_with_CU = 1;
+  }
+
+  /**********************
+   * Gather the list of admin-disabled cpus and mems
+   */
+  if (!(dstatus->flags & HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES)) {
+    hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
+    dstatus->flags |= HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES;
+  }
+
+  /**********************
+   * CPU information
+   */
+
+  /* Don't rediscover CPU resources if already done */
+  if (already_pus)
+    goto cpudone;
+
+  /* Gather the list of cpus now */
+  err = hwloc_linux_try_hardwired_cpuinfo(backend);
+  if (!err)
+    goto cpudone;
+
+  /* setup root info */
+  hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+		    &global_infos, &global_infos_count);
+
+  if (!sysfs_cpu_path) {
+    /* /sys/.../topology unavailable (before 2.6.16)
+     * or not containing anything interesting */
+    hwloc_linux_fallback_pu_level(backend);
+    if (data->use_dt)
+      look_powerpc_device_tree(topology, data);
+
+  } else {
+    /* sysfs */
+    if (look_sysfscpu(topology, data, sysfs_cpu_path, old_siblings_filenames, Lprocs, numprocs) < 0)
+      /* sysfs but we failed to read cpu topology, fallback */
+      hwloc_linux_fallback_pu_level(backend);
+  }
+
+ cpudone:
+
+  /*********************
+   * Memory information
+   */
+
+  /* Get the machine memory attributes */
+  hwloc_get_machine_meminfo(data, &topology->machine_memory);
+
+  /* Gather NUMA information. */
+  if (sysfs_node_path) {
+    /* annotate existing NUMA nodes, or discover them if there are none yet */
+    if (hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE) > 0)
+      annotate_sysfsnode(topology, data, sysfs_node_path, &nbnodes);
+    else
+      look_sysfsnode(topology, data, sysfs_node_path, &nbnodes);
+  } else
+    nbnodes = 0;
+
+  /**********************
+   * Misc
+   */
+
+  /* Gather DMI info */
+  hwloc__get_dmi_id_info(data, topology->levels[0][0]);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
+  if (cpuset_name) {
+    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
+    free(cpuset_name);
+  }
+
+  hwloc__linux_get_mic_sn(topology, data);
+
+  /* data->utsname was filled with real uname or \0, we can safely pass it */
+  hwloc_add_uname_info(topology, &data->utsname);
+
+  /* releases the per-processor infos and whatever is still referenced by global_infos */
+  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+  return 0;
+}
+
+
+
+/****************************************
+ ***** Linux PCI backend callbacks ******
+ ****************************************/
+
+/*
+ * Backend callback reporting where a PCI device is located:
+ * reads the device's local_cpus mask from sysfs into cpuset.
+ * Returns 0 when a non-empty mask was read, -1 otherwise.
+ */
+static int
+hwloc_linux_backend_get_pci_busid_cpuset(struct hwloc_backend *backend,
+					 struct hwloc_pcidev_attr_s *busid, hwloc_bitmap_t cpuset)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
   char path[256];
+
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
+	   busid->domain, busid->bus,
+	   busid->dev, busid->func);
+
+  if (hwloc__read_path_as_cpumask(path, cpuset, priv->root_fd) != 0)
+    return -1;
+
+  /* an empty mask carries no locality information, report it as a failure */
+  return hwloc_bitmap_iszero(cpuset) ? -1 : 0;
+}
+
+
+
+#ifdef HWLOC_HAVE_LINUXIO
+
+/***********************************
+ ******* Linux I/O discovery *******
+ ***********************************/
+
+/* Flags for the osdev_flags argument of the OS device helpers below. */
+#define HWLOC_LINUXFS_OSDEV_FLAG_FIND_VIRTUAL (1U<<0) /* do not reject devices whose sysfs link contains "/virtual/" */
+#define HWLOC_LINUXFS_OSDEV_FLAG_FIND_USB (1U<<1) /* do not reject devices whose sysfs link contains "/usb" */
+#define HWLOC_LINUXFS_OSDEV_FLAG_BLOCK_WITH_SECTORS (1U<<2) /* the "size" file is in 512-byte units rather than bytes */
+#define HWLOC_LINUXFS_OSDEV_FLAG_UNDER_BUS (1U<<31) /* device attributes are under ".." instead of "device" */
+
+/* Find the topology object under which the OS device at osdevpath (a sysfs
+ * path, resolved relative to root_fd) should be attached.
+ *
+ * The sysfs link of the device is read first. Unless allowed by osdev_flags,
+ * virtual and USB devices are rejected (NULL is returned). Otherwise the
+ * parent is searched in this order:
+ *  1. the last PCI bus id found in the link (previous ones are bridges);
+ *  2. the NUMA node named in the device's numa_node file;
+ *  3. the cpuset read from the device's local_cpus file;
+ *  4. the root object.
+ *
+ * Returns NULL if the link cannot be read or the device is filtered out.
+ */
+static hwloc_obj_t
+hwloc_linuxfs_find_osdev_parent(struct hwloc_backend *backend, int root_fd,
+				const char *osdevpath, unsigned osdev_flags)
+{
+  struct hwloc_topology *topology = backend->topology;
+  char path[256], buf[10];
+  int fd;
+  int foundpci;
+  unsigned pcidomain = 0, pcibus = 0, pcidev = 0, pcifunc = 0;
+  unsigned _pcidomain, _pcibus, _pcidev, _pcifunc;
+  hwloc_bitmap_t cpuset;
+  const char *tmp;
+  hwloc_obj_t parent;
+  const char *devicesubdir;
+  int consumed;
+  int err;
+
+  if (osdev_flags & HWLOC_LINUXFS_OSDEV_FLAG_UNDER_BUS)
+    devicesubdir = "..";
+  else
+    devicesubdir = "device";
+
+  /* keep one byte for the terminator added below, the link target is not NUL-terminated */
+  err = hwloc_readlink(osdevpath, path, sizeof(path)-1, root_fd);
+  if (err < 0) {
+    /* /sys/class/<class>/<name> is a directory instead of a symlink on old kernels (at least around 2.6.18 and 2.6.25).
+     * The link to parse can be found in /sys/class/<class>/<name>/device instead, at least for "/pci..."
+     */
+    char olddevpath[256];
+    snprintf(olddevpath, sizeof(olddevpath), "%s/device", osdevpath);
+    err = hwloc_readlink(olddevpath, path, sizeof(path)-1, root_fd);
+    if (err < 0)
+      return NULL;
+  }
+  path[err] = '\0';
+
+  if (!(osdev_flags & HWLOC_LINUXFS_OSDEV_FLAG_FIND_VIRTUAL)) {
+    if (strstr(path, "/virtual/"))
+      return NULL;
+  }
+
+  if (!(osdev_flags & HWLOC_LINUXFS_OSDEV_FLAG_FIND_USB)) {
+    if (strstr(path, "/usb"))
+      return NULL;
+  }
+
+  /* skip the "/pciXXXX:XX" root complex component */
+  tmp = strstr(path, "/pci");
+  if (!tmp)
+    goto nopci;
+  tmp = strchr(tmp+4, '/');
+  if (!tmp)
+    goto nopci;
+  tmp++;
+
+  /* iterate through busid to find the last one (previous ones are bridges).
+   * tmp points to the first character of a path component. Parse from there
+   * (not one character later, which would drop the leading hex digit) and use
+   * %n to step over exactly what was parsed, whatever the width of the domain.
+   */
+  foundpci = 0;
+  for(;;) {
+    consumed = 0;
+    if (sscanf(tmp, "%x:%x:%x.%x%n", &_pcidomain, &_pcibus, &_pcidev, &_pcifunc, &consumed) == 4) {
+      pcidomain = _pcidomain;
+    } else if (sscanf(tmp, "%x:%x.%x%n", &_pcibus, &_pcidev, &_pcifunc, &consumed) == 3) {
+      /* old format without a domain */
+      pcidomain = 0;
+    } else {
+      break;
+    }
+    foundpci = 1;
+    pcibus = _pcibus;
+    pcidev = _pcidev;
+    pcifunc = _pcifunc;
+    tmp += consumed;
+    /* stop at the end of the link instead of reading past the terminator */
+    if (*tmp != '/')
+      break;
+    tmp++;
+  }
+
+  if (foundpci) {
+    /* attach to a PCI parent or to a normal (non-I/O) parent found by PCI affinity */
+    parent = hwloc_pci_find_parent_by_busid(topology, pcidomain, pcibus, pcidev, pcifunc);
+    if (parent)
+      return parent;
+  }
+
+ nopci:
+  /* attach directly near the right NUMA node */
+  snprintf(path, sizeof(path), "%s/%s/numa_node", osdevpath, devicesubdir);
+  fd = hwloc_open(path, root_fd);
+  if (fd >= 0) {
+    /* read() does not terminate the buffer, keep room to do it before atoi() */
+    err = read(fd, buf, sizeof(buf)-1);
+    close(fd);
+    if (err > 0) {
+      int node;
+      buf[err] = '\0';
+      node = atoi(buf);
+      if (node >= 0) {
+	parent = hwloc_get_numanode_obj_by_os_index(topology, (unsigned) node);
+	if (parent) {
+	  /* don't attach I/O under numa node, attach to the same normal parent */
+	  while (hwloc__obj_type_is_memory(parent->type))
+	    parent = parent->parent;
+	  return parent;
+	}
+      }
+    }
+  }
+
+  /* attach directly to the right cpuset */
+  snprintf(path, sizeof(path), "%s/%s/local_cpus", osdevpath, devicesubdir);
+  cpuset = hwloc__alloc_read_path_as_cpumask(path, root_fd);
+  if (cpuset) {
+    parent = hwloc_find_insert_io_parent_by_complete_cpuset(topology, cpuset);
+    hwloc_bitmap_free(cpuset);
+    if (parent)
+      return parent;
+  }
+
+  /* FIXME: {numa_node,local_cpus} may be missing when the device link points to a subdirectory.
+   * For instance, device of scsi blocks may point to foo/ata1/host0/target0:0:0/0:0:0:0/ instead of foo/
+   * In such case, we should look for device/../../../../{numa_node,local_cpus} instead of device/{numa_node,local_cpus}
+   * Not needed yet since scsi blocks use the PCI locality above.
+   */
+
+  /* fallback to the root object */
+  return hwloc_get_root_obj(topology);
+}
+
+/* Allocate an OS device object of the given type, named after a copy of name,
+ * insert it below pcidev and return it.
+ */
+static hwloc_obj_t
+hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
+{
+  struct hwloc_topology *topo = backend->topology;
+  struct hwloc_obj *osdev;
+
+  osdev = hwloc_alloc_setup_object(topo, HWLOC_OBJ_OS_DEVICE, HWLOC_UNKNOWN_INDEX);
+  osdev->attr->osdev.type = type;
+  osdev->name = strdup(name);
+
+  /* insert_object_by_parent() doesn't merge during insert,
+   * so the object may still be handed back to the caller afterwards.
+   */
+  hwloc_insert_object_by_parent(topo, pcidev, osdev);
+  return osdev;
+}
+
+static void
+hwloc_linuxfs_block_class_fillinfos(struct hwloc_backend *backend __hwloc_attribute_unused, int root_fd,
+				    struct hwloc_obj *obj, const char *osdevpath, unsigned osdev_flags)
+{
+#ifdef HWLOC_HAVE_LIBUDEV
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+#endif
+  FILE *file;
+  char path[296]; /* osdevpath <= 256 */
   char line[128];
   char vendor[64] = "";
   char model[64] = "";
   char serial[64] = "";
   char revision[64] = "";
   char blocktype[64] = "";
+  unsigned sectorsize = 0;
   unsigned major_id, minor_id;
+  char *devicesubdir;
   char *tmp;
 
-  snprintf(path, sizeof(path), "%s/dev", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (!fd)
-    return;
-
-  if (NULL == fgets(line, sizeof(line), fd)) {
-    fclose(fd);
-    return;
+  if (osdev_flags & HWLOC_LINUXFS_OSDEV_FLAG_UNDER_BUS)
+    devicesubdir = "..";
+  else
+    devicesubdir = "device";
+
+  snprintf(path, sizeof(path), "%s/size", osdevpath);
+  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
+    unsigned long long value = strtoull(line, NULL, 10);
+    /* linux always reports size in 512-byte units for blocks, and bytes for dax, we want kB */
+    snprintf(line, sizeof(line), "%llu",
+	     (osdev_flags & HWLOC_LINUXFS_OSDEV_FLAG_BLOCK_WITH_SECTORS) ? value / 2 : value >> 10);
+    hwloc_obj_add_info(obj, "Size", line);
+  }
+
+  snprintf(path, sizeof(path), "%s/queue/hw_sector_size", osdevpath);
+  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
+    sectorsize = strtoul(line, NULL, 10);
+  }
+
+  snprintf(path, sizeof(path), "%s/%s/devtype", osdevpath, devicesubdir);
+  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
+    /* non-volatile devices use the following subtypes:
+     * nd_namespace_pmem for pmem/raw (/dev/pmemX)
+     * nd_btt for pmem/sector (/dev/pmemXs)
+     * nd_pfn for pmem/fsdax (/dev/pmemX)
+     * nd_dax for pmem/devdax (/dev/daxX) but it's not a block device anyway
+     * nd_namespace_blk for blk/raw and blk/sector (/dev/ndblkX) ?
+     *
+     * Note that device/sector_size in btt devices includes integrity metadata
+     * (512/4096 block + 0/N) while queue/hw_sector_size above is the user sectorsize
+     * without metadata.
+     */
+    if (!strncmp(line, "nd_", 3))
+      strcpy(blocktype, "NVDIMM"); /* Save the blocktype now since udev reports "" so far */
+  }
+  if (sectorsize) {
+    snprintf(line, sizeof(line), "%u", sectorsize);
+    hwloc_obj_add_info(obj, "SectorSize", line);
   }
-  fclose(fd);
 
+  snprintf(path, sizeof(path), "%s/dev", osdevpath);
+  if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
+    goto done;
   if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
-    return;
+    goto done;
   tmp = strchr(line, '\n');
   if (tmp)
     *tmp = '\0';
   hwloc_obj_add_info(obj, "LinuxDeviceID", line);
 
-#ifdef HAVE_LIBUDEV_H
+#ifdef HWLOC_HAVE_LIBUDEV
   if (data->udev) {
     struct udev_device *dev;
     const char *prop;
     dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
     if (!dev)
-      return;
+      goto done;
     prop = udev_device_get_property_value(dev, "ID_VENDOR");
-    if (prop)
-      strcpy(vendor, prop);
+    if (prop) {
+      strncpy(vendor, prop, sizeof(vendor));
+      vendor[sizeof(vendor)-1] = '\0';
+    }
     prop = udev_device_get_property_value(dev, "ID_MODEL");
-    if (prop)
-      strcpy(model, prop);
+    if (prop) {
+      strncpy(model, prop, sizeof(model));
+      model[sizeof(model)-1] = '\0';
+    }
     prop = udev_device_get_property_value(dev, "ID_REVISION");
-    if (prop)
-      strcpy(revision, prop);
+    if (prop) {
+      strncpy(revision, prop, sizeof(revision));
+      revision[sizeof(revision)-1] = '\0';
+    }
     prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
-    if (prop)
-      strcpy(serial, prop);
+    if (prop) {
+      strncpy(serial, prop, sizeof(serial));
+      serial[sizeof(serial)-1] = '\0';
+    }
     prop = udev_device_get_property_value(dev, "ID_TYPE");
-    if (prop)
-      strcpy(blocktype, prop);
+    if (prop) {
+      strncpy(blocktype, prop, sizeof(blocktype));
+      blocktype[sizeof(blocktype)-1] = '\0';
+    }
 
     udev_device_unref(dev);
   } else
@@ -4388,29 +5743,35 @@ hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
 #endif
  {
   snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (!fd)
-    return;
+  file = hwloc_fopen(path, "r", root_fd);
+  if (!file)
+    goto done;
 
-  while (NULL != fgets(line, sizeof(line), fd)) {
+  while (NULL != fgets(line, sizeof(line), file)) {
     tmp = strchr(line, '\n');
     if (tmp)
       *tmp = '\0';
     if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
-      strcpy(vendor, line+strlen("E:ID_VENDOR="));
+      strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
+      vendor[sizeof(vendor)-1] = '\0';
     } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
-      strcpy(model, line+strlen("E:ID_MODEL="));
+      strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
+      model[sizeof(model)-1] = '\0';
     } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
-      strcpy(revision, line+strlen("E:ID_REVISION="));
+      strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
+      revision[sizeof(revision)-1] = '\0';
     } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
-      strcpy(serial, line+strlen("E:ID_SERIAL_SHORT="));
+      strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
+      serial[sizeof(serial)-1] = '\0';
     } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
-      strcpy(blocktype, line+strlen("E:ID_TYPE="));
+      strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
+      blocktype[sizeof(blocktype)-1] = '\0';
     }
   }
-  fclose(fd);
+  fclose(file);
  }
 
+ done:
   /* clear fake "ATA" vendor name */
   if (!strcasecmp(vendor, "ATA"))
     *vendor = '\0';
@@ -4437,401 +5798,1007 @@ hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
   if (*serial)
     hwloc_obj_add_info(obj, "SerialNumber", serial);
 
-  if (!strcmp(blocktype, "disk"))
-    hwloc_obj_add_info(obj, "Type", "Disk");
-  else if (!strcmp(blocktype, "tape"))
-    hwloc_obj_add_info(obj, "Type", "Tape");
-  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
-    hwloc_obj_add_info(obj, "Type", "Removable Media Device");
-  else /* generic, usb mass storage/rbc, usb mass storage/scsi */
-    hwloc_obj_add_info(obj, "Type", "Other");
-}
-
-/* block class objects are in
- * host%d/target%d:%d:%d/%d:%d:%d:%d/
- * or
- * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
- * or
- * ide%d/%d.%d/
- * below pci devices */
+  if (!strcmp(blocktype, "disk") || !strncmp(obj->name, "nvme", 4))
+    obj->subtype = strdup("Disk");
+  else if (!strcmp(blocktype, "NVDIMM")) /* FIXME: set by us above, to workaround udev returning "" so far */
+    obj->subtype = strdup("NVDIMM");
+  else if (!strcmp(blocktype, "tape"))
+    obj->subtype = strdup("Tape");
+  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
+    obj->subtype = strdup("Removable Media Device");
+  else {
+    /* generic, usb mass storage/rbc, usb mass storage/scsi */
+  }
+}
+
+static int
+hwloc_linuxfs_lookup_block_class(struct hwloc_backend *backend, unsigned osdev_flags)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  DIR *dir;
+  struct dirent *dirent;
+  /* Add one block OS device per non-partition entry of /sys/class/block whose parent is found; always returns 0. */
+  dir = hwloc_opendir("/sys/class/block", root_fd);
+  if (!dir)
+    return 0; /* directory could not be opened: nothing to add */
+
+  osdev_flags |= HWLOC_LINUXFS_OSDEV_FLAG_BLOCK_WITH_SECTORS; /* uses 512B sectors */
+
+  while ((dirent = readdir(dir)) != NULL) {
+    char path[256];
+    struct stat stbuf;
+    hwloc_obj_t obj, parent;
+    int err;
+
+    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+      continue;
+
+    /* ignore partitions, i.e. entries with a "partition" attribute (a truncated path is simply not tested) */
+    err = snprintf(path, sizeof(path), "/sys/class/block/%s/partition", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& hwloc_stat(path, &stbuf, root_fd) >= 0)
+      continue;
+
+    err = snprintf(path, sizeof(path), "/sys/class/block/%s", dirent->d_name);
+    if ((size_t) err >= sizeof(path))
+      continue; /* name does not fit in path, skip this entry */
+    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
+    if (!parent)
+      continue;
+
+    /* USB devices are created here but removed later when USB PCI devices get filtered out
+     * (unless WHOLE_IO is enabled).
+     */
+
+    obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_BLOCK, dirent->d_name);
+
+    hwloc_linuxfs_block_class_fillinfos(backend, root_fd, obj, path, osdev_flags); /* path is "/sys/class/block/<name>" here */
+  }
+
+  closedir(dir);
+
+  return 0;
+}
+
+static int
+hwloc_linuxfs_lookup_dax_class(struct hwloc_backend *backend, unsigned osdev_flags)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  DIR *dir;
+  struct dirent *dirent;
+  /* Add one block OS device per DAX device found in sysfs whose parent is found; always returns 0. */
+  /* depending on the kernel config, dax devices may appear either in /sys/bus/dax or /sys/class/dax */
+
+  dir = hwloc_opendir("/sys/bus/dax/devices", root_fd);
+  if (dir) {
+    int found = 0;
+    while ((dirent = readdir(dir)) != NULL) {
+      char path[300];
+      char driver[256];
+      hwloc_obj_t obj, parent;
+      int err;
+
+      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+	continue;
+      found++; /* counted before the kmem test: kmem-only systems still skip /sys/class/dax below */
+
+      /* ignore kmem-device, those appear as additional NUMA nodes */
+      err = snprintf(path, sizeof(path), "/sys/bus/dax/devices/%s/driver", dirent->d_name);
+      if ((size_t) err >= sizeof(path))
+	continue;
+      err = hwloc_readlink(path, driver, sizeof(driver)-1, root_fd); /* keep one byte for the \0 written below */
+      if (err >= 0) {
+	driver[err] = '\0';
+	if (err >= 5 && !strcmp(driver+err-5, "/kmem")) /* only test the suffix when the target is long enough */
+	  continue;
+      }
+
+      snprintf(path, sizeof(path), "/sys/bus/dax/devices/%s", dirent->d_name); /* cannot truncate: the longer ".../driver" path fit above */
+      parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags | HWLOC_LINUXFS_OSDEV_FLAG_UNDER_BUS);
+      if (!parent)
+	continue;
+
+      obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_BLOCK, dirent->d_name);
+
+      hwloc_linuxfs_block_class_fillinfos(backend, root_fd, obj, path, osdev_flags | HWLOC_LINUXFS_OSDEV_FLAG_UNDER_BUS);
+    }
+    closedir(dir);
+
+    /* don't look in /sys/class/dax if we found something in /sys/bus/dax */
+    if (found)
+      return 0;
+  }
+
+  dir = hwloc_opendir("/sys/class/dax", root_fd);
+  if (dir) {
+    while ((dirent = readdir(dir)) != NULL) {
+      char path[256];
+      hwloc_obj_t obj, parent;
+      int err;
+
+      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+	continue;
+
+      /* kmem not supported in class mode, driver may only be changed under bus */
+
+      err = snprintf(path, sizeof(path), "/sys/class/dax/%s", dirent->d_name);
+      if ((size_t) err >= sizeof(path))
+	continue;
+      parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
+      if (!parent)
+	continue;
+
+      obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_BLOCK, dirent->d_name);
+
+      hwloc_linuxfs_block_class_fillinfos(backend, root_fd, obj, path, osdev_flags);
+    }
+    closedir(dir);
+  }
+
+  return 0;
+}
+
+static void
+hwloc_linuxfs_net_class_fillinfos(int root_fd,
+				  struct hwloc_obj *obj, const char *osdevpath)
+{ /* Add "Address" and, when <osdevpath>/device/infiniband exists, "Port" infos to obj. */
+  struct stat st;
+  char path[296]; /* osdevpath <= 256 */
+  char address[128];
+  int err;
+  snprintf(path, sizeof(path), "%s/address", osdevpath);
+  if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
+    char *eol = strchr(address, '\n'); /* cut the value at the first newline */
+    if (eol)
+      *eol = 0;
+    hwloc_obj_add_info(obj, "Address", address);
+  }
+  snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
+  if (!hwloc_stat(path, &st, root_fd)) {
+    char hexid[16];
+    snprintf(path, sizeof(path), "%s/dev_port", osdevpath);
+    err = hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd);
+    if (err < 0) {
+      /* fall back to dev_id for old kernels/drivers */
+      snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
+      err = hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd);
+    }
+    if (!err) {
+      char *eoid;
+      unsigned long port;
+      port = strtoul(hexid, &eoid, 0); /* base 0: decimal, octal or 0x-prefixed hexadecimal */
+      if (eoid != hexid) { /* at least one digit was parsed */
+	char portstr[21];
+	snprintf(portstr, sizeof(portstr), "%lu", port+1); /* exported value is the sysfs value plus one */
+	hwloc_obj_add_info(obj, "Port", portstr);
+      }
+    }
+  }
+}
+
+static int
+hwloc_linuxfs_lookup_net_class(struct hwloc_backend *backend, unsigned osdev_flags)
+{
+  /* Walk /sys/class/net and attach one network OS device per entry whose parent is found; always returns 0. */
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  const int sysroot = priv->root_fd;
+  struct dirent *entry;
+  DIR *netdir = hwloc_opendir("/sys/class/net", sysroot);
+
+  if (netdir == NULL)
+    return 0; /* class directory could not be opened: nothing to add */
+
+  for (entry = readdir(netdir); entry != NULL; entry = readdir(netdir)) {
+    const char *ifname = entry->d_name;
+    char devpath[256];
+    hwloc_obj_t owner, netdev;
+    int written;
+
+    /* skip the "." and ".." pseudo-entries */
+    if (strcmp(ifname, ".") == 0 || strcmp(ifname, "..") == 0)
+      continue;
+
+    /* drop names whose sysfs path would not fit in devpath */
+    written = snprintf(devpath, sizeof(devpath), "/sys/class/net/%s", ifname);
+    if (sizeof(devpath) <= (size_t) written)
+      continue;
+
+    owner = hwloc_linuxfs_find_osdev_parent(backend, sysroot, devpath, osdev_flags);
+    if (owner != NULL) {
+      netdev = hwloc_linux_add_os_device(backend, owner, HWLOC_OBJ_OSDEV_NETWORK, ifname);
+      hwloc_linuxfs_net_class_fillinfos(sysroot, netdev, devpath);
+    }
+  }
+  closedir(netdir);
+  return 0;
+}
+
+static void
+hwloc_linuxfs_infiniband_class_fillinfos(int root_fd,
+					 struct hwloc_obj *obj, const char *osdevpath)
+{
+  char path[296]; /* osdevpath <= 256 */
+  char guidvalue[20];
+  unsigned i,j;
+  /* Add NodeGUID, SysImageGUID and per-port State/LID/LMC/GID infos read from sysfs under osdevpath. */
+  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
+  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
+    size_t len;
+    len = strspn(guidvalue, "0123456789abcdefx:"); /* cut at the first unexpected character (e.g. newline) */
+    guidvalue[len] = '\0';
+    hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
+  }
+
+  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
+  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
+    size_t len;
+    len = strspn(guidvalue, "0123456789abcdefx:");
+    guidvalue[len] = '\0';
+    hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
+  }
+
+  for(i=1; ; i++) { /* ports are probed from 1 until a "state" file is missing */
+    char statevalue[2];
+    char lidvalue[11];
+    char gidvalue[40];
+
+    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
+    if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
+      char statename[32];
+      statevalue[1] = '\0'; /* only keep the first byte/digit */
+      snprintf(statename, sizeof(statename), "Port%uState", i);
+      hwloc_obj_add_info(obj, statename, statevalue);
+    } else {
+      /* no such port */
+      break;
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
+    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
+      char lidname[32];
+      size_t len;
+      len = strspn(lidvalue, "0123456789abcdefx");
+      lidvalue[len] = '\0';
+      snprintf(lidname, sizeof(lidname), "Port%uLID", i);
+      hwloc_obj_add_info(obj, lidname, lidvalue);
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
+    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
+      char lidname[32];
+      size_t len;
+      len = strspn(lidvalue, "0123456789");
+      lidvalue[len] = '\0';
+      snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
+      hwloc_obj_add_info(obj, lidname, lidvalue);
+    }
+
+    for(j=0; ; j++) { /* GIDs are probed from 0 until a file is missing */
+      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
+      if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
+	char gidname[32];
+	size_t len;
+	len = strspn(gidvalue, "0123456789abcdefx:");
+	gidvalue[len] = '\0';
+	if (len < 20 || strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) { /* never compare stale bytes past the \0 */
+	  /* only keep initialized GIDs */
+	  snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
+	  hwloc_obj_add_info(obj, gidname, gidvalue);
+	}
+      } else {
+	/* no such gid */
+	break;
+      }
+    }
+  }
+}
+
+static int
+hwloc_linuxfs_lookup_infiniband_class(struct hwloc_backend *backend, unsigned osdev_flags)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  DIR *dir;
+  struct dirent *dirent;
+  /* Add one OpenFabrics OS device per /sys/class/infiniband entry whose parent is found; always returns 0. */
+  dir = hwloc_opendir("/sys/class/infiniband", root_fd);
+  if (!dir)
+    return 0; /* directory could not be opened: nothing to add */
+
+  while ((dirent = readdir(dir)) != NULL) {
+    char path[256];
+    hwloc_obj_t obj, parent;
+    int err;
+
+    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+      continue;
+
+    /* blocklist scif* fake devices */
+    if (!strncmp(dirent->d_name, "scif", 4))
+      continue;
+
+    err = snprintf(path, sizeof(path), "/sys/class/infiniband/%s", dirent->d_name);
+    if ((size_t) err >= sizeof(path)) /* err == sizeof(path) also means the output was truncated */
+      continue;
+    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
+    if (!parent)
+      continue;
+
+    obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_OPENFABRICS, dirent->d_name);
+
+    hwloc_linuxfs_infiniband_class_fillinfos(root_fd, obj, path);
+  }
+
+  closedir(dir);
+
+  return 0;
+}
+
+static void
+hwloc_linuxfs_mic_class_fillinfos(int root_fd,
+				  struct hwloc_obj *obj, const char *osdevpath)
+{
+  char path[296]; /* osdevpath <= 256 */
+  char family[64];
+  char sku[64];
+  char sn[64];
+  char string[21]; /* scratch: raw sysfs text in, decimal text out */
+  /* Set obj's subtype to "MIC" and add the MIC* infos whose sysfs attributes under osdevpath can be read. */
+  obj->subtype = strdup("MIC"); /* strdup() result is not checked */
+
+  snprintf(path, sizeof(path), "%s/family", osdevpath);
+  if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
+    char *eol = strchr(family, '\n'); /* cut the value at the first newline */
+    if (eol)
+      *eol = 0;
+    hwloc_obj_add_info(obj, "MICFamily", family);
+  }
+
+  snprintf(path, sizeof(path), "%s/sku", osdevpath);
+  if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
+    char *eol = strchr(sku, '\n');
+    if (eol)
+      *eol = 0;
+    hwloc_obj_add_info(obj, "MICSKU", sku);
+  }
+
+  snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
+  if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
+    char *eol;
+    eol = strchr(sn, '\n');
+    if (eol)
+      *eol = 0;
+    hwloc_obj_add_info(obj, "MICSerialNumber", sn);
+  }
+
+  snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
+  if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
+    unsigned long count = strtoul(string, NULL, 16); /* parsed as hexadecimal */
+    snprintf(string, sizeof(string), "%lu", count); /* re-exported as decimal, reusing the same buffer */
+    hwloc_obj_add_info(obj, "MICActiveCores", string);
+  }
+
+  snprintf(path, sizeof(path), "%s/memsize", osdevpath);
+  if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
+    unsigned long count = strtoul(string, NULL, 16); /* parsed as hexadecimal */
+    snprintf(string, sizeof(string), "%lu", count); /* re-exported as decimal; unit is not visible here */
+    hwloc_obj_add_info(obj, "MICMemorySize", string);
+  }
+}
+
+static int
+hwloc_linuxfs_lookup_mic_class(struct hwloc_backend *backend, unsigned osdev_flags)
+{
+  /* Scan /sys/class/mic and add one co-processor OS device per "mic<N>" entry whose parent is found. */
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  const int sysroot = priv->root_fd;
+  struct dirent *entry;
+  DIR *micdir = hwloc_opendir("/sys/class/mic", sysroot);
+
+  if (micdir == NULL)
+    return 0; /* class directory could not be opened: nothing to add */
+
+  for (entry = readdir(micdir); entry != NULL; entry = readdir(micdir)) {
+    const char *name = entry->d_name;
+    char devpath[256];
+    hwloc_obj_t owner, coproc;
+    unsigned micnum;
+
+    /* skip "." and "..", then anything that does not start with "mic<N>" */
+    if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+      continue;
+    if (sscanf(name, "mic%u", &micnum) != 1)
+      continue;
+
+    /* the sysfs path is rebuilt from the parsed index, the object keeps the raw entry name */
+    snprintf(devpath, sizeof(devpath), "/sys/class/mic/mic%u", micnum);
+    owner = hwloc_linuxfs_find_osdev_parent(backend, sysroot, devpath, osdev_flags);
+    if (owner != NULL) {
+      coproc = hwloc_linux_add_os_device(backend, owner, HWLOC_OBJ_OSDEV_COPROC, name);
+      hwloc_linuxfs_mic_class_fillinfos(sysroot, coproc, devpath);
+    }
+  }
+
+  closedir(micdir);
+  return 0;
+}
+
 static int
-hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
-				    struct hwloc_obj *pcidev, char *path, size_t pathlen)
+hwloc_linuxfs_lookup_drm_class(struct hwloc_backend *backend, unsigned osdev_flags)
 {
   struct hwloc_linux_backend_data_s *data = backend->private_data;
   int root_fd = data->root_fd;
-  DIR *hostdir, *portdir, *targetdir;
-  struct dirent *hostdirent, *portdirent, *targetdirent;
-  size_t hostdlen, portdlen, targetdlen;
-  int dummy;
-  int res = 0;
-
-  hostdir = hwloc_opendir(path, root_fd);
-  if (!hostdir)
+  DIR *dir;
+  struct dirent *dirent;
+  /* Add one GPU OS device per /sys/class/drm entry that has a "dev" attribute and whose parent is found; always returns 0. */
+  dir = hwloc_opendir("/sys/class/drm", root_fd);
+  if (!dir)
     return 0;
 
-  while ((hostdirent = readdir(hostdir)) != NULL) {
-    if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
-    {
-      /* found host%d/port-%d:%d */
-      path[pathlen] = '/';
-      strcpy(&path[pathlen+1], hostdirent->d_name);
-      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
-      portdir = hwloc_opendir(path, root_fd);
-      if (!portdir)
-	continue;
-      while ((portdirent = readdir(portdir)) != NULL) {
-	if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
-	  /* found host%d/port-%d:%d/end_device-%d:%d */
-	  path[pathlen] = '/';
-	  strcpy(&path[pathlen+1], portdirent->d_name);
-	  pathlen += portdlen = 1+strlen(portdirent->d_name);
-	  res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
-	  /* restore parent path */
-	  pathlen -= portdlen;
-	  path[pathlen] = '\0';
-	}
-      }
-      closedir(portdir);
-      /* restore parent path */
-      pathlen -= hostdlen;
-      path[pathlen] = '\0';
+  while ((dirent = readdir(dir)) != NULL) {
+    char path[256];
+    hwloc_obj_t parent;
+    struct stat stbuf;
+    int err;
+
+    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
       continue;
-    } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
-      /* found host%d/target%d:%d:%d */
-      path[pathlen] = '/';
-      strcpy(&path[pathlen+1], hostdirent->d_name);
-      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
-      targetdir = hwloc_opendir(path, root_fd);
-      if (!targetdir)
-	continue;
-      while ((targetdirent = readdir(targetdir)) != NULL) {
-	if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
-	  continue;
-	/* found host%d/target%d:%d:%d/%d:%d:%d:%d */
-	path[pathlen] = '/';
-	strcpy(&path[pathlen+1], targetdirent->d_name);
-	pathlen += targetdlen = 1+strlen(targetdirent->d_name);
-	/* lookup block class for real */
-	res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
-	/* restore parent path */
-	pathlen -= targetdlen;
-	path[pathlen] = '\0';
-      }
-      closedir(targetdir);
-      /* restore parent path */
-      pathlen -= hostdlen;
-      path[pathlen] = '\0';
-    }
+
+    /* only keep main devices, not subdevices for outputs: entries without a "dev" attribute are skipped */
+    err = snprintf(path, sizeof(path), "/sys/class/drm/%s/dev", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& hwloc_stat(path, &stbuf, root_fd) < 0)
+      continue;
+
+    /* Most drivers expose a card%d device.
+     * Some (free?) drivers also expose render%d.
+     * Old kernels also have a controlD%d. On recent kernels, it's a symlink to card%d (deprecated?).
+     * There can also exist some output-specific files such as card0-DP-1.
+     *
+     * All these aren't very useful compared to CUDA/OpenCL/...
+     * Hence the DRM class is only enabled when KEEP_ALL.
+     *
+     * FIXME: We might want to filter everything out but card%d.
+     * Maybe look at the driver (read the end of /sys/class/drm/<name>/device/driver symlink),
+     * to decide whether card%d could be useful (likely not for NVIDIA).
+     */
+
+    err = snprintf(path, sizeof(path), "/sys/class/drm/%s", dirent->d_name);
+    if ((size_t) err >= sizeof(path))
+      continue; /* name does not fit in path, skip this entry */
+    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
+    if (!parent)
+      continue;
+
+    hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_GPU, dirent->d_name); /* no extra info attributes are added here */
   }
-  closedir(hostdir);
 
-  return res;
+  closedir(dir);
+
+  return 0;
 }
 
 static int
-hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
-			       struct hwloc_obj *pcidev, const char *pcidevpath)
+hwloc_linuxfs_lookup_dma_class(struct hwloc_backend *backend, unsigned osdev_flags)
 {
   struct hwloc_linux_backend_data_s *data = backend->private_data;
   int root_fd = data->root_fd;
-  size_t pathlen;
-  DIR *devicedir, *hostdir;
-  struct dirent *devicedirent, *hostdirent;
-  size_t devicedlen, hostdlen;
-  char path[256];
-  int dummy;
-  int res = 0;
+  DIR *dir;
+  struct dirent *dirent;
+  /* Add one DMA OS device per /sys/class/dma entry whose parent is found; always returns 0. */
+  dir = hwloc_opendir("/sys/class/dma", root_fd);
+  if (!dir)
+    return 0; /* directory could not be opened: nothing to add */
+
+  while ((dirent = readdir(dir)) != NULL) {
+    char path[256];
+    hwloc_obj_t parent;
+    int err;
+
+    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+      continue;
+
+    err = snprintf(path, sizeof(path), "/sys/class/dma/%s", dirent->d_name);
+    if ((size_t) err >= sizeof(path))
+      continue; /* name does not fit in path, skip this entry */
+    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
+    if (!parent)
+      continue;
+
+    hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_DMA, dirent->d_name); /* no extra info attributes are added here */
+  }
 
-  strcpy(path, pcidevpath);
-  pathlen = strlen(path);
+  closedir(dir);
+
+  return 0;
+}
+
+struct hwloc_firmware_dmi_mem_device_header { /* leading bytes of a /sys/firmware/dmi/entries/17-<N>/raw file, filled with fread() */
+  unsigned char type;
+  unsigned char length; /* the string area is read starting at this file offset */
+  unsigned char handle[2];
+  unsigned char phy_mem_handle[2];
+  unsigned char mem_err_handle[2];
+  unsigned char tot_width[2];
+  unsigned char dat_width[2];
+  unsigned char size[2];
+  unsigned char ff;
+  unsigned char dev_set;
+  unsigned char dev_loc_str_num; /* *_str_num: index (counted from 1) of a string following the header */
+  unsigned char bank_loc_str_num;
+  unsigned char mem_type;
+  unsigned char type_detail[2];
+  unsigned char speed[2];
+  unsigned char manuf_str_num;
+  unsigned char serial_str_num;
+  unsigned char asset_tag_str_num;
+  unsigned char part_num_str_num;
+  /* Later fields are deliberately left out: we don't need them,
+   * and some old implementations may lack them.
+   */
+};
 
-  devicedir = hwloc_opendir(pcidevpath, root_fd);
-  if (!devicedir)
+static int check_dmi_entry(const char *buffer) /* returns 0 for an empty or all-spaces string, 1 otherwise */
+{
+  /* reject empty strings */
+  if (!*buffer)
+    return 0;
+  /* reject strings of spaces (at least Dell uses this for empty memory slots) */
+  if (strspn(buffer, " ") == strlen(buffer))
     return 0;
+  return 1; /* string has at least one non-space character */
+}
 
-  while ((devicedirent = readdir(devicedir)) != NULL) {
-    if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
-      /* found ide%d */
-      path[pathlen] = '/';
-      strcpy(&path[pathlen+1], devicedirent->d_name);
-      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
-      hostdir = hwloc_opendir(path, root_fd);
-      if (!hostdir)
-	continue;
-      while ((hostdirent = readdir(hostdir)) != NULL) {
-	if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
-	  /* found ide%d/%d.%d */
-	  path[pathlen] = '/';
-	  strcpy(&path[pathlen+1], hostdirent->d_name);
-	  pathlen += hostdlen = 1+strlen(hostdirent->d_name);
-	  /* lookup block class for real */
-	  res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
-	  /* restore parent path */
-	  pathlen -= hostdlen;
-	  path[pathlen] = '\0';
+static int
+hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
+					unsigned idx, const char *path, FILE *fd,
+					struct hwloc_firmware_dmi_mem_device_header *header)
+{
+  unsigned slen;
+  char buffer[256]; /* enough for memory device strings, or at least for each of them */
+  unsigned foff; /* offset in raw file */
+  unsigned boff; /* offset in buffer read from raw file */
+  unsigned i; /* index of the string being parsed, counted from 1 */
+  struct hwloc_info_s *infos = NULL;
+  unsigned infos_count = 0;
+  hwloc_obj_t misc;
+  int foundinfo = 0;
+  /* Turn the strings of one DMI memory-device entry into a Misc "MemoryModule" object under the root; returns 1 if inserted, else 0. */
+  /* start after the header */
+  foff = header->length;
+  i = 1;
+  while (1) {
+    /* read one buffer */
+    if (fseek(fd, foff, SEEK_SET) < 0)
+      break;
+    if (!fgets(buffer, sizeof(buffer), fd))
+      break;
+    /* read string at the beginning of the buffer */
+    boff = 0;
+    while (1) {
+      /* stop on empty string */
+      if (!buffer[boff])
+        goto done;
+      /* stop if this string goes to the end of the buffer */
+      slen = strlen(buffer+boff);
+      if (boff + slen+1 == sizeof(buffer))
+        break;
+      /* string didn't get truncated, should be OK */
+      if (i == header->manuf_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
+	  foundinfo = 1;
 	}
-      }
-      closedir(hostdir);
-      /* restore parent path */
-      pathlen -= devicedlen;
-      path[pathlen] = '\0';
-    } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
-      /* found host%d */
-      path[pathlen] = '/';
-      strcpy(&path[pathlen+1], devicedirent->d_name);
-      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
-      res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
-      /* restore parent path */
-      pathlen -= devicedlen;
-      path[pathlen] = '\0';
-    } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
-      /* found ata%d */
-      path[pathlen] = '/';
-      strcpy(&path[pathlen+1], devicedirent->d_name);
-      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
-      hostdir = hwloc_opendir(path, root_fd);
-      if (!hostdir)
-	continue;
-      while ((hostdirent = readdir(hostdir)) != NULL) {
-	if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
-	  /* found ata%d/host%d */
-	  path[pathlen] = '/';
-	  strcpy(&path[pathlen+1], hostdirent->d_name);
-	  pathlen += hostdlen = 1+strlen(hostdirent->d_name);
-	  /* lookup block class for real */
-          res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
-	  /* restore parent path */
-	  pathlen -= hostdlen;
-	  path[pathlen] = '\0';
+      }	else if (i == header->serial_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->asset_tag_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->part_num_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->dev_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
+	}
+      } else if (i == header->bank_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
 	}
+      } else {
+	goto done; /* string index not referenced by any header field handled above: stop parsing */
       }
-      closedir(hostdir);
-      /* restore parent path */
-      pathlen -= devicedlen;
-      path[pathlen] = '\0';
+      /* next string in buffer */
+      boff += slen+1;
+      i++;
+    }
+    /* couldn't read a single full string from that buffer, we're screwed */
+    if (!boff) {
+      fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
+	      i, path);
+      break;
     }
+    /* reread buffer after previous string */
+    foff += boff;
+  }
+
+done:
+  if (!foundinfo) {
+    /* found no actual info about the device. if there's only location info, the slot may be empty */
+    goto out_with_infos;
   }
-  closedir(devicedir);
 
-  return res;
+  misc = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MISC, idx);
+  if (!misc)
+    goto out_with_infos;
+
+  misc->subtype = strdup("MemoryModule");
+
+  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
+  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
+   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
+   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
+   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
+   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
+   */
+  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
+  return 1;
+
+ out_with_infos:
+  hwloc__free_infos(infos, infos_count);
+  return 0;
 }
 
-static void
-hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
-				struct hwloc_obj *obj, const char *osdevpath)
+static int
+hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
+				    struct hwloc_linux_backend_data_s *data)
+{
+  /* Walk /sys/firmware/dmi/entries/17-<N>/raw for N = 0, 1, 2, ... and hand each
+   * entry with a complete header to hwloc__get_firmware_dmi_memory_info_one().
+   * The scan ends at the first entry that is missing or unusable; always returns 0. */
+  char rawpath[128];
+  unsigned entry = 0;
+  int usable = 1;
+
+  while (usable) {
+    struct hwloc_firmware_dmi_mem_device_header hdr;
+    FILE *raw;
+
+    snprintf(rawpath, sizeof(rawpath), "/sys/firmware/dmi/entries/17-%u/raw", entry);
+    raw = hwloc_fopen(rawpath, "r", data->root_fd);
+    /* a missing entry ends the scan */
+    if (raw == NULL)
+      break;
+
+    /* need the whole fixed-size header, and an entry that claims to be at least that long */
+    if (fread(&hdr, sizeof(hdr), 1, raw) != 1)
+      usable = 0;
+    else if (hdr.length < sizeof(hdr))
+      usable = 0; /* invalid, or too old entry/spec that doesn't contain what we need */
+    else
+      hwloc__get_firmware_dmi_memory_info_one(topology, entry, rawpath, raw, &hdr);
+
+    fclose(raw);
+    entry++;
+  }
+
+  return 0;
+}
+
+#ifdef HWLOC_HAVE_LINUXPCI
+
+#define HWLOC_PCI_REVISION_ID 0x08
+#define HWLOC_PCI_CAP_ID_EXP 0x10
+#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
+
+static int
+hwloc_linuxfs_pci_look_pcidevices(struct hwloc_backend *backend)
 {
   struct hwloc_linux_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  hwloc_obj_t tree = NULL;
   int root_fd = data->root_fd;
-  FILE *fd;
-  char path[256];
+  DIR *dir;
+  struct dirent *dirent;
+
+  /* We could lookup /sys/devices/pci.../.../busid1/.../busid2 recursively
+   * to build the hierarchy of bridges/devices directly.
+   * But that would require readdirs in all bridge sysfs subdirectories.
+   * Do a single readdir in the linear list in /sys/bus/pci/devices/...
+   * and build the hierarchy manually instead.
+   */
+  dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
+  if (!dir)
+    return 0;
+
+  while ((dirent = readdir(dir)) != NULL) {
+#define CONFIG_SPACE_CACHESIZE 256
+    unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
+    unsigned domain, bus, dev, func;
+    unsigned secondary_bus, subordinate_bus;
+    unsigned short class_id;
+    hwloc_obj_type_t type;
+    hwloc_obj_t obj;
+    struct hwloc_pcidev_attr_s *attr;
+    unsigned offset;
+    char path[64];
+    char value[16];
+    size_t ret;
+    int fd, err;
+
+    if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
+      continue;
+
+    if (domain > 0xffff) {
+      static int warned = 0;
+      if (!warned)
+	fprintf(stderr, "Ignoring PCI device with non-16bit domain\n");
+      warned = 1;
+      continue;
+    }
+
+    /* initialize the config space in case we fail to read it (missing permissions, etc). */
+    memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
+    if ((size_t) err < sizeof(path)) {
+      /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
+      fd = hwloc_open(path, root_fd);
+      if (fd >= 0) {
+	ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
+	(void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
+	close(fd);
+      }
+    }
 
-  hwloc_obj_add_info(obj, "CoProcType", "MIC");
+    class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+      class_id = strtoul(value, NULL, 16) >> 8;
 
-  snprintf(path, sizeof(path), "%s/family", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char family[64];
-    if (fgets(family, sizeof(family), fd)) {
-      char *eol = strchr(family, '\n');
-      if (eol)
-        *eol = 0;
-      hwloc_obj_add_info(obj, "MICFamily", family);
-    }
-    fclose(fd);
-  }
+    type = hwloc_pcidisc_check_bridge_type(class_id, config_space_cache);
 
-  snprintf(path, sizeof(path), "%s/sku", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char sku[64];
-    if (fgets(sku, sizeof(sku), fd)) {
-      char *eol = strchr(sku, '\n');
-      if (eol)
-        *eol = 0;
-      hwloc_obj_add_info(obj, "MICSKU", sku);
+    if (type == HWLOC_OBJ_BRIDGE) {
+      /* since 4.13, there's secondary_bus_number and subordinate_bus_number in sysfs,
+       * but reading them from the config-space is easy anyway.
+       */
+      if (hwloc_pcidisc_find_bridge_buses(domain, bus, dev, func,
+					  &secondary_bus, &subordinate_bus,
+					  config_space_cache) < 0)
+	continue;
     }
-    fclose(fd);
-  }
 
-  snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char sn[64];
-    if (fgets(sn, sizeof(sn), fd)) {
-      char *eol = strchr(sn, '\n');
-      if (eol)
-        *eol = 0;
-      hwloc_obj_add_info(obj, "MICSerialNumber", sn);
+    /* filtered? */
+    if (type == HWLOC_OBJ_PCI_DEVICE) {
+      enum hwloc_type_filter_e filter;
+      hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &filter);
+      if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
+	continue;
+      if (filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT
+	  && !hwloc_filter_check_pcidev_subtype_important(class_id))
+	continue;
+    } else if (type == HWLOC_OBJ_BRIDGE) {
+      enum hwloc_type_filter_e filter;
+      hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &filter);
+      if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
+	continue;
+      /* HWLOC_TYPE_FILTER_KEEP_IMPORTANT filtered later in the core */
     }
-    fclose(fd);
-  }
 
-  snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char string[10];
-    if (fgets(string, sizeof(string), fd)) {
-      unsigned long count = strtoul(string, NULL, 16);
-      snprintf(string, sizeof(string), "%lu", count);
-      hwloc_obj_add_info(obj, "MICActiveCores", string);
+    obj = hwloc_alloc_setup_object(topology, type, HWLOC_UNKNOWN_INDEX);
+    if (!obj)
+      break;
+    attr = &obj->attr->pcidev;
+
+    attr->domain = domain;
+    attr->bus = bus;
+    attr->dev = dev;
+    attr->func = func;
+
+    /* bridge specific attributes */
+    if (type == HWLOC_OBJ_BRIDGE) {
+      struct hwloc_bridge_attr_s *battr = &obj->attr->bridge;
+      battr->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
+      battr->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+      battr->downstream.pci.domain = domain;
+      battr->downstream.pci.secondary_bus = secondary_bus;
+      battr->downstream.pci.subordinate_bus = subordinate_bus;
     }
-    fclose(fd);
-  }
 
-  snprintf(path, sizeof(path), "%s/memsize", osdevpath);
-  fd = hwloc_fopen(path, "r", root_fd);
-  if (fd) {
-    char string[20];
-    if (fgets(string, sizeof(string), fd)) {
-      unsigned long count = strtoul(string, NULL, 16);
-      snprintf(string, sizeof(string), "%lu", count);
-      hwloc_obj_add_info(obj, "MICMemorySize", string);
+    /* default (unknown) values */
+    attr->vendor_id = 0;
+    attr->device_id = 0;
+    attr->class_id = class_id;
+    attr->revision = 0;
+    attr->subvendor_id = 0;
+    attr->subdevice_id = 0;
+    attr->linkspeed = 0;
+
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+      attr->vendor_id = strtoul(value, NULL, 16);
+
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+      attr->device_id = strtoul(value, NULL, 16);
+
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+      attr->subvendor_id = strtoul(value, NULL, 16);
+
+    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
+    if ((size_t) err < sizeof(path)
+	&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+      attr->subdevice_id = strtoul(value, NULL, 16);
+
+    /* get the revision */
+    attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
+
+    /* try to get the link speed */
+    offset = hwloc_pcidisc_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
+    if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE) {
+      hwloc_pcidisc_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
+    } else {
+      /* if not available from config-space (extended part is root-only), look in sysfs files added in 4.13 */
+      float speed = 0.f;
+      unsigned width = 0;
+      err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_speed", dirent->d_name);
+      if ((size_t) err < sizeof(path)
+	  && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+	speed = hwloc_linux_pci_link_speed_from_string(value);
+      err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_width", dirent->d_name);
+      if ((size_t) err < sizeof(path)
+	  && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
+	width = atoi(value);
+      attr->linkspeed = speed*width/8;
     }
-    fclose(fd);
+
+    hwloc_pcidisc_tree_insert_by_busid(&tree, obj);
   }
-}
 
-static int
-hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
-			     struct hwloc_obj *pcidev, const char *pcidevpath)
-{
-  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
+  closedir(dir);
+
+  hwloc_pcidisc_tree_attach(backend->topology, tree);
+  return 0;
 }
 
 static int
-hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
-				   struct hwloc_obj *pcidev)
+hwloc_linuxfs_pci_look_pcislots(struct hwloc_backend *backend)
 {
+  struct hwloc_topology *topology = backend->topology;
   struct hwloc_linux_backend_data_s *data = backend->private_data;
   int root_fd = data->root_fd;
-  char path[256];
-  struct stat st;
-  hwloc_obj_t obj;
-  unsigned idx;
-  int res = 0;
-
-  if (!data->mic_directlookup_id_max)
-    /* already tried, nothing to do */
-    return 0;
-
-  if (data->mic_directlookup_id_max == (unsigned) -1) {
-    /* never tried, find out the max id */
-    DIR *dir;
-    struct dirent *dirent;
-
-    /* make sure we never do this lookup again */
-    data->mic_directlookup_id_max = 0;
+  DIR *dir;
+  struct dirent *dirent;
 
-    /* read the entire class and find the max id of mic%u dirents */
-    dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
-    if (!dir) {
-      dir = opendir("/sys/class/mic");
-      if (!dir)
-	return 0;
-    }
+  dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
+  if (dir) {
     while ((dirent = readdir(dir)) != NULL) {
-      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
-	continue;
-      if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
+      char path[64];
+      char buf[64];
+      unsigned domain, bus, dev;
+      int err;
+
+      if (dirent->d_name[0] == '.')
 	continue;
-      if (idx >= data->mic_directlookup_id_max)
-	data->mic_directlookup_id_max = idx+1;
+      err = snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
+      if ((size_t) err < sizeof(path)
+	  && !hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
+	  && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+	/* may also be %x:%x without a device number but that's only for hotplug when nothing is plugged, ignore those */
+	hwloc_obj_t obj = hwloc_pci_find_by_busid(topology, domain, bus, dev, 0);
+	/* obj may be higher in the hierarchy than requested (if that exact bus didn't exist),
+	 * we'll check below whether the bus ID is correct.
+	 */
+	while (obj) {
+	  /* Apply the slot to that device and its siblings with same domain/bus/dev ID.
+	   * Make sure that siblings are still PCI and on the same bus
+	   * (optional bridge filtering can put different things together).
+	   */
+	  if (obj->type != HWLOC_OBJ_PCI_DEVICE &&
+	      (obj->type != HWLOC_OBJ_BRIDGE || obj->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI))
+	    break;
+	  if (obj->attr->pcidev.domain != domain
+	      || obj->attr->pcidev.bus != bus
+	      || obj->attr->pcidev.dev != dev)
+	    break;
+
+	  hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
+	  obj = obj->next_sibling;
+	}
+      }
     }
     closedir(dir);
   }
 
-  /* now iterate over the mic ids and see if one matches our pcidev */
-  for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
-    snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
-	     idx, pcidev->attr->pcidev.bus,  pcidev->attr->pcidev.dev,  pcidev->attr->pcidev.func);
-    if (hwloc_stat(path, &st, root_fd) < 0)
-      continue;
-    snprintf(path, sizeof(path), "mic%u", idx);
-    obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
-    snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
-    hwloc_linux_mic_class_fillinfos(backend, obj, path);
-    res++;
-  }
-
-  return res;
+  return 0;
 }
+#endif /* HWLOC_HAVE_LINUXPCI */
+#endif /* HWLOC_HAVE_LINUXIO */
 
-/*
- * backend callback for inserting objects inside a pci device
- */
 static int
-hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
-				      struct hwloc_obj *obj)
+hwloc_look_linuxfs(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
 {
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  char pcidevpath[256];
-  int res = 0;
-
-  /* this callback is only used in the libpci backend for now */
-  assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
-
-  snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
-	   obj->attr->pcidev.domain, obj->attr->pcidev.bus,
-	   obj->attr->pcidev.dev, obj->attr->pcidev.func);
-
-  res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
-  res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
-  res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
-  res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
-  res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
-
-  if (data->mic_need_directlookup == -1) {
-    struct stat st;
-    if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
-	&& hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
-      /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
-       * do not have mic/mic%u symlinks to mic devices (old mic driver).
-       * if so, try from the mic class.
-       */
-      data->mic_need_directlookup = 1;
-    else
-      data->mic_need_directlookup = 0;
+  /*
+   * This backend may be used with topology->is_thissystem set (default)
+   * or not (modified fsroot path).
+   */
+
+  struct hwloc_topology *topology = backend->topology;
+#ifdef HWLOC_HAVE_LINUXIO
+  enum hwloc_type_filter_e pfilter, bfilter, ofilter, mfilter;
+#endif /* HWLOC_HAVE_LINUXIO */
+
+  if (dstatus->phase == HWLOC_DISC_PHASE_CPU) {
+    hwloc_linuxfs_look_cpu(backend, dstatus);
+    return 0;
   }
-  if (data->mic_need_directlookup)
-    res += hwloc_linux_directlookup_mic_class(backend, obj);
-  else
-    res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
 
-  return res;
-}
+#ifdef HWLOC_HAVE_LINUXIO
+  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &pfilter);
+  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &bfilter);
+  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &ofilter);
+  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_MISC, &mfilter);
 
-/*
- * backend callback for retrieving the location of a pci device
- */
-static int
-hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
-				   struct hwloc_backend *caller __hwloc_attribute_unused,
-				   struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
-{
-  struct hwloc_linux_backend_data_s *data = backend->private_data;
-  char path[256];
-  FILE *file;
-  int err;
+  if (dstatus->phase == HWLOC_DISC_PHASE_PCI
+      && (bfilter != HWLOC_TYPE_FILTER_KEEP_NONE
+	  || pfilter != HWLOC_TYPE_FILTER_KEEP_NONE)) {
+#ifdef HWLOC_HAVE_LINUXPCI
+    hwloc_linuxfs_pci_look_pcidevices(backend);
+    /* no need to run another PCI phase */
+    dstatus->excluded_phases |= HWLOC_DISC_PHASE_PCI;
+#endif /* HWLOC_HAVE_LINUXPCI */
+  }
 
-  /* this callback is only used in the libpci backend for now */
-  assert(obj->type == HWLOC_OBJ_PCI_DEVICE
-	 || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
+  if (dstatus->phase == HWLOC_DISC_PHASE_ANNOTATE
+      && (bfilter != HWLOC_TYPE_FILTER_KEEP_NONE
+	  || pfilter != HWLOC_TYPE_FILTER_KEEP_NONE)) {
+#ifdef HWLOC_HAVE_LINUXPCI
+    hwloc_linuxfs_pci_look_pcislots(backend);
+#endif /* HWLOC_HAVE_LINUXPCI */
+  }
 
-  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
-	   obj->attr->pcidev.domain, obj->attr->pcidev.bus,
-	   obj->attr->pcidev.dev, obj->attr->pcidev.func);
-  file = hwloc_fopen(path, "r", data->root_fd);
-  if (file) {
-    err = hwloc_linux_parse_cpumap_file(file, cpuset);
-    fclose(file);
-    if (!err && !hwloc_bitmap_iszero(cpuset))
-      return 0;
+  if (dstatus->phase == HWLOC_DISC_PHASE_IO
+      && ofilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
+    unsigned osdev_flags = 0;
+    if (getenv("HWLOC_VIRTUAL_LINUX_OSDEV"))
+      osdev_flags |= HWLOC_LINUXFS_OSDEV_FLAG_FIND_VIRTUAL;
+    if (ofilter == HWLOC_TYPE_FILTER_KEEP_ALL)
+      osdev_flags |= HWLOC_LINUXFS_OSDEV_FLAG_FIND_USB;
+
+    hwloc_linuxfs_lookup_block_class(backend, osdev_flags);
+    hwloc_linuxfs_lookup_dax_class(backend, osdev_flags);
+    hwloc_linuxfs_lookup_net_class(backend, osdev_flags);
+    hwloc_linuxfs_lookup_infiniband_class(backend, osdev_flags);
+    hwloc_linuxfs_lookup_mic_class(backend, osdev_flags);
+    if (ofilter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT) {
+      hwloc_linuxfs_lookup_drm_class(backend, osdev_flags);
+      hwloc_linuxfs_lookup_dma_class(backend, osdev_flags);
+    }
   }
-  return -1;
-}
 
+  if (dstatus->phase == HWLOC_DISC_PHASE_MISC
+      && mfilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
+    hwloc__get_firmware_dmi_memory_info(topology, backend->private_data);
+  }
+#endif /* HWLOC_HAVE_LINUXIO */
 
+  return 0;
+}
 
 /*******************************
  ******* Linux component *******
@@ -4842,9 +6809,12 @@ hwloc_linux_backend_disable(struct hwloc_backend *backend)
 {
   struct hwloc_linux_backend_data_s *data = backend->private_data;
 #ifdef HAVE_OPENAT
-  close(data->root_fd);
+  if (data->root_fd >= 0) {
+    free(data->root_path);
+    close(data->root_fd);
+  }
 #endif
-#ifdef HAVE_LIBUDEV_H
+#ifdef HWLOC_HAVE_LIBUDEV
   if (data->udev)
     udev_unref(data->udev);
 #endif
@@ -4852,17 +6822,20 @@ hwloc_linux_backend_disable(struct hwloc_backend *backend)
 }
 
 static struct hwloc_backend *
-hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
-				  const void *_data1,
+hwloc_linux_component_instantiate(struct hwloc_topology *topology,
+				  struct hwloc_disc_component *component,
+				  unsigned excluded_phases __hwloc_attribute_unused,
+				  const void *_data1 __hwloc_attribute_unused,
 				  const void *_data2 __hwloc_attribute_unused,
 				  const void *_data3 __hwloc_attribute_unused)
 {
   struct hwloc_backend *backend;
   struct hwloc_linux_backend_data_s *data;
-  const char * fsroot_path = _data1;
-  int flags, root = -1;
+  const char * fsroot_path;
+  int root = -1;
+  char *env;
 
-  backend = hwloc_backend_alloc(component);
+  backend = hwloc_backend_alloc(topology, component);
   if (!backend)
     goto out;
 
@@ -4874,59 +6847,82 @@ hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
 
   backend->private_data = data;
   backend->discover = hwloc_look_linuxfs;
-  backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
-  backend->notify_new_object = hwloc_linux_backend_notify_new_object;
+  backend->get_pci_busid_cpuset = hwloc_linux_backend_get_pci_busid_cpuset;
   backend->disable = hwloc_linux_backend_disable;
 
   /* default values */
-  data->is_amd_with_CU = 0;
+  data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
   data->is_knl = 0;
+  data->is_amd_with_CU = 0;
+  data->use_dt = 0;
   data->is_real_fsroot = 1;
+  data->root_path = NULL;
+  fsroot_path = getenv("HWLOC_FSROOT");
   if (!fsroot_path)
     fsroot_path = "/";
 
+  if (strcmp(fsroot_path, "/")) {
 #ifdef HAVE_OPENAT
-  root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
-  if (root < 0)
-    goto out_with_data;
+    int flags;
+
+    root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
+    if (root < 0)
+      goto out_with_data;
 
-  if (strcmp(fsroot_path, "/")) {
     backend->is_thissystem = 0;
     data->is_real_fsroot = 0;
-  }
-
-  /* Since this fd stays open after hwloc returns, mark it as
-     close-on-exec so that children don't inherit it.  Stevens says
-     that we should GETFD before we SETFD, so we do. */
-  flags = fcntl(root, F_GETFD, 0);
-  if (-1 == flags ||
-      -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
+    data->root_path = strdup(fsroot_path);
+
+    /* Since this fd stays open after hwloc returns, mark it as
+       close-on-exec so that children don't inherit it.  Stevens says
+       that we should GETFD before we SETFD, so we do. */
+    flags = fcntl(root, F_GETFD, 0);
+    if (-1 == flags ||
+	-1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
       close(root);
       root = -1;
       goto out_with_data;
-  }
+    }
 #else
-  if (strcmp(fsroot_path, "/")) {
+    fprintf(stderr, "Cannot change Linux fsroot without openat() support.\n");
     errno = ENOSYS;
     goto out_with_data;
-  }
 #endif
+  }
   data->root_fd = root;
 
-#ifdef HAVE_LIBUDEV_H
+#ifdef HWLOC_HAVE_LIBUDEV
   data->udev = NULL;
   if (data->is_real_fsroot) {
     data->udev = udev_new();
   }
 #endif
 
-  data->deprecated_classlinks_model = -2; /* never tried */
-  data->mic_need_directlookup = -1; /* not initialized */
-  data->mic_directlookup_id_max = -1; /* not initialized */
+  data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
+  if (!data->dumped_hwdata_dirname)
+    data->dumped_hwdata_dirname = (char *) RUNSTATEDIR "/hwloc/";
+
+  data->use_numa_distances = 1;
+  data->use_numa_distances_for_cpuless = 1;
+  data->use_numa_initiators = 1;
+  env = getenv("HWLOC_USE_NUMA_DISTANCES");
+  if (env) {
+    unsigned val = atoi(env);
+    data->use_numa_distances = !!(val & 3); /* 2 implies 1 */
+    data->use_numa_distances_for_cpuless = !!(val & 2);
+    data->use_numa_initiators = !!(val & 4);
+  }
+
+  env = getenv("HWLOC_USE_DT");
+  if (env)
+    data->use_dt = atoi(env);
 
   return backend;
 
  out_with_data:
+#ifdef HAVE_OPENAT
+  free(data->root_path);
+#endif
   free(data);
  out_with_backend:
   free(backend);
@@ -4935,11 +6931,12 @@ hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
 }
 
 static struct hwloc_disc_component hwloc_linux_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
   "linux",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  HWLOC_DISC_PHASE_CPU | HWLOC_DISC_PHASE_PCI | HWLOC_DISC_PHASE_IO | HWLOC_DISC_PHASE_MISC | HWLOC_DISC_PHASE_ANNOTATE,
+  HWLOC_DISC_PHASE_GLOBAL,
   hwloc_linux_component_instantiate,
   50,
+  1,
   NULL
 };
 
@@ -4950,236 +6947,3 @@ const struct hwloc_component hwloc_linux_component = {
   0,
   &hwloc_linux_disc_component
 };
-
-
-
-
-#ifdef HWLOC_HAVE_LINUXPCI
-
-/***********************************
- ******* Linux PCI component *******
- ***********************************/
-
-#define HWLOC_PCI_REVISION_ID 0x08
-#define HWLOC_PCI_CAP_ID_EXP 0x10
-#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
-
-static int
-hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
-{
-  struct hwloc_topology *topology = backend->topology;
-  struct hwloc_backend *tmpbackend;
-  hwloc_obj_t first_obj = NULL, last_obj = NULL;
-  int root_fd = -1;
-  DIR *dir;
-  struct dirent *dirent;
-  int res = 0;
-
-  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
-    return 0;
-
-  if (hwloc_get_next_pcidev(topology, NULL)) {
-    hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
-    return 0;
-  }
-
-  /* hackily find the linux backend to steal its fsroot */
-  tmpbackend = topology->backends;
-  while (tmpbackend) {
-    if (tmpbackend->component == &hwloc_linux_disc_component) {
-      root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
-      hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
-      break;    }
-    tmpbackend = tmpbackend->next;
-  }
-  /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
-  if (root_fd >= 0)
-    root_fd = dup(root_fd);
-  else
-    root_fd = open("/", O_RDONLY | O_DIRECTORY);
-
-  dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
-  if (!dir)
-    goto out_with_rootfd;
-
-  while ((dirent = readdir(dir)) != NULL) {
-    unsigned domain, bus, dev, func;
-    hwloc_obj_t obj;
-    struct hwloc_pcidev_attr_s *attr;
-    unsigned os_index;
-    char path[64];
-    char value[16];
-    size_t read;
-    FILE *file;
-
-    if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
-      continue;
-
-    os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
-    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
-    if (!obj)
-      break;
-    attr = &obj->attr->pcidev;
-
-    attr->domain = domain;
-    attr->bus = bus;
-    attr->dev = dev;
-    attr->func = func;
-
-    /* default (unknown) values */
-    attr->vendor_id = 0;
-    attr->device_id = 0;
-    attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
-    attr->revision = 0;
-    attr->subvendor_id = 0;
-    attr->subdevice_id = 0;
-    attr->linkspeed = 0;
-
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-      read = fread(value, 1, sizeof(value), file);
-      fclose(file);
-      if (read)
-        attr->vendor_id = strtoul(value, NULL, 16);
-    }
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-      read = fread(value, 1, sizeof(value), file);
-      fclose(file);
-      if (read)
-        attr->device_id = strtoul(value, NULL, 16);
-    }
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-      read = fread(value, 1, sizeof(value), file);
-      fclose(file);
-      if (read)
-        attr->class_id = strtoul(value, NULL, 16) >> 8;
-    }
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-      read = fread(value, 1, sizeof(value), file);
-      fclose(file);
-      if (read)
-        attr->subvendor_id = strtoul(value, NULL, 16);
-    }
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-      read = fread(value, 1, sizeof(value), file);
-      fclose(file);
-      if (read)
-        attr->subdevice_id = strtoul(value, NULL, 16);
-    }
-
-    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
-    file = hwloc_fopen(path, "r", root_fd);
-    if (file) {
-#define CONFIG_SPACE_CACHESIZE 256
-      unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
-      unsigned offset;
-
-      /* initialize the config space in case we fail to read it (missing permissions, etc). */
-      memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
-      read = fread(config_space_cache, 1, CONFIG_SPACE_CACHESIZE, file);
-      (void) read; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
-      fclose(file);
-
-      /* is this a bridge? */
-      hwloc_pci_prepare_bridge(obj, config_space_cache);
-
-      /* get the revision */
-      attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
-
-      /* try to get the link speed */
-      offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
-      if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
-	hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
-    }
-
-    if (first_obj)
-      last_obj->next_sibling = obj;
-    else
-      first_obj = obj;
-    last_obj = obj;
-  }
-
-  closedir(dir);
-
-  dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
-  if (dir) {
-    while ((dirent = readdir(dir)) != NULL) {
-      char path[64];
-      FILE *file;
-      if (dirent->d_name[0] == '.')
-	continue;
-      snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
-      file = hwloc_fopen(path, "r", root_fd);
-      if (file) {
-	unsigned domain, bus, dev;
-	if (fscanf(file, "%x:%x:%x", &domain, &bus, &dev) == 3) {
-	  hwloc_obj_t obj = first_obj;
-	  while (obj) {
-	    if (obj->attr->pcidev.domain == domain
-		&& obj->attr->pcidev.bus == bus
-		&& obj->attr->pcidev.dev == dev
-		&& obj->attr->pcidev.func == 0) {
-	      hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
-	      break;
-	    }
-	    obj = obj->next_sibling;
-	  }
-	}
-	fclose(file);
-      }
-    }
-    closedir(dir);
-  }
-
-  res = hwloc_insert_pci_device_list(backend, first_obj);
-
- out_with_rootfd:
-  close(root_fd);
-  return res;
-}
-
-static struct hwloc_backend *
-hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
-				     const void *_data1 __hwloc_attribute_unused,
-				     const void *_data2 __hwloc_attribute_unused,
-				     const void *_data3 __hwloc_attribute_unused)
-{
-  struct hwloc_backend *backend;
-
-  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
-
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
-  backend->discover = hwloc_look_linuxfs_pci;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_MISC,
-  "linuxpci",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  hwloc_linuxpci_component_instantiate,
-  19, /* after pci */
-  NULL
-};
-
-const struct hwloc_component hwloc_linuxpci_component = {
-  HWLOC_COMPONENT_ABI,
-  NULL, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_linuxpci_disc_component
-};
-
-#endif /* HWLOC_HAVE_LINUXPCI */
diff --git a/ext/hwloc/hwloc/topology-noos.c b/ext/hwloc/hwloc/topology-noos.c
index a926428e9..174b6fd8c 100644
--- a/ext/hwloc/hwloc/topology-noos.c
+++ b/ext/hwloc/hwloc/topology-noos.c
@@ -1,39 +1,55 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
 
 static int
-hwloc_look_noos(struct hwloc_backend *backend)
+hwloc_look_noos(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
 {
+  /*
+   * This backend uses the underlying OS.
+   * However we don't enforce topology->is_thissystem so that
+   * we may still force the use of this backend when debugging with !thissystem.
+   */
+
   struct hwloc_topology *topology = backend->topology;
+  int nbprocs;
+
+  assert(dstatus->phase == HWLOC_DISC_PHASE_CPU);
 
   if (topology->levels[0][0]->cpuset)
     /* somebody discovered things */
-    return 0;
+    return -1;
+
+  nbprocs = hwloc_fallback_nbprocessors(0);
+  if (nbprocs >= 1)
+    topology->support.discovery->pu = 1;
+  else
+    nbprocs = 1;
 
-  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
-  if (topology->is_thissystem)
-    hwloc_add_uname_info(topology, NULL);
-  return 1;
+  hwloc_alloc_root_sets(topology->levels[0][0]);
+  hwloc_setup_pu_level(topology, nbprocs);
+  hwloc_add_uname_info(topology, NULL);
+  return 0;
 }
 
 static struct hwloc_backend *
-hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
+hwloc_noos_component_instantiate(struct hwloc_topology *topology,
+				 struct hwloc_disc_component *component,
+				 unsigned excluded_phases __hwloc_attribute_unused,
 				 const void *_data1 __hwloc_attribute_unused,
 				 const void *_data2 __hwloc_attribute_unused,
 				 const void *_data3 __hwloc_attribute_unused)
 {
   struct hwloc_backend *backend;
-  backend = hwloc_backend_alloc(component);
+  backend = hwloc_backend_alloc(topology, component);
   if (!backend)
     return NULL;
   backend->discover = hwloc_look_noos;
@@ -41,11 +57,12 @@ hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
 }
 
 static struct hwloc_disc_component hwloc_noos_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
   "no_os",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  HWLOC_DISC_PHASE_CPU,
+  HWLOC_DISC_PHASE_GLOBAL,
   hwloc_noos_component_instantiate,
   40, /* lower than native OS component, higher than globals */
+  1,
   NULL
 };
 
diff --git a/ext/hwloc/hwloc/topology-opencl.cb b/ext/hwloc/hwloc/topology-opencl.cb
deleted file mode 100644
index 85057c7c1..000000000
--- a/ext/hwloc/hwloc/topology-opencl.cb
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright © 2012-2014 Inria.  All rights reserved.
- * Copyright © 2013 Université Bordeaux.  All right reserved.
- * See COPYING in top-level directory.
- */
-
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <hwloc/plugins.h>
-
-/* private headers allowed for convenience because this plugin is built within hwloc */
-#include <private/misc.h>
-#include <private/debug.h>
-
-#include <CL/cl_ext.h>
-
-typedef enum hwloc_opencl_device_type_e {
-  HWLOC_OPENCL_DEVICE_AMD
-} hwloc_opencl_device_type_t;
-
-struct hwloc_opencl_backend_data_s {
-  unsigned nr_devices; /* -1 when unknown yet, first callback will setup */
-  struct hwloc_opencl_device_info_s {
-    hwloc_opencl_device_type_t type;
-
-    unsigned platformidx;
-    char platformname[64];
-    unsigned platformdeviceidx;
-    char devicename[64];
-    char devicevendor[64];
-    char devicetype[64];
-
-    unsigned computeunits;
-    unsigned long long globalmemsize;
-
-    union hwloc_opencl_device_info_u {
-      struct hwloc_opencl_device_info_amd_s {
-        unsigned pcidomain, pcibus, pcidev, pcifunc;
-      } amd;
-    } specific;
-  } * devices;
-};
-
-static void
-hwloc_opencl_query_devices(struct hwloc_opencl_backend_data_s *data)
-{
-  cl_platform_id *platform_ids = NULL;
-  cl_uint nr_platforms;
-  cl_device_id *device_ids = NULL;
-  cl_uint nr_devices, nr_total_devices, tmp;
-  cl_int clret;
-  unsigned curpfidx, curpfdvidx, i;
-
-  /* mark the number of devices as 0 in case we fail below,
-   * so that we don't try again later.
-   */
-  data->nr_devices = 0;
-
-  /* count platforms, allocate and get them */
-  clret = clGetPlatformIDs(0, NULL, &nr_platforms);
-  if (CL_SUCCESS != clret || !nr_platforms)
-    goto out;
-  hwloc_debug("%u OpenCL platforms\n", nr_platforms);
-  platform_ids = malloc(nr_platforms * sizeof(*platform_ids));
-  if (!platform_ids)
-    goto out;
-  clret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
-  if (CL_SUCCESS != clret || !nr_platforms)
-    goto out_with_platform_ids;
-
-  /* how many devices, total? */
-  tmp = 0;
-  for(i=0; i<nr_platforms; i++) {
-    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices);
-    if (CL_SUCCESS != clret)
-      goto out_with_platform_ids;
-    tmp += nr_devices;
-  }
-  nr_total_devices = tmp;
-  hwloc_debug("%u OpenCL devices total\n", nr_total_devices);
-  /* allocate structs */
-  device_ids = malloc(nr_total_devices * sizeof(*device_ids));
-  data->devices = malloc(nr_total_devices * sizeof(*data->devices));
-  if (!data->devices || !device_ids)
-    goto out_with_device_ids;
-  /* actually query device ids */
-  tmp = 0;
-  for(i=0; i<nr_platforms; i++) {
-    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nr_total_devices - tmp, device_ids + tmp, &nr_devices);
-    if (CL_SUCCESS != clret)
-      goto out_with_device_ids;
-    tmp += nr_devices;
-  }
-
-  /* query individual devices */
-  curpfidx = 0;
-  curpfdvidx = 0;
-  for(i=0; i<nr_total_devices; i++) {
-    struct hwloc_opencl_device_info_s *info = &data->devices[data->nr_devices];
-    cl_platform_id platform_id = 0;
-    cl_device_type type;
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-    cl_device_topology_amd amdtopo;
-#endif
-    cl_ulong globalmemsize;
-    cl_uint computeunits;
-
-    hwloc_debug("Looking device %p\n", device_ids[i]);
-
-    info->platformname[0] = '\0';
-    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL);
-    if (CL_SUCCESS != clret)
-      continue;
-    clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(info->platformname), info->platformname, NULL);
-
-    info->devicename[0] = '\0';
-#ifdef CL_DEVICE_BOARD_NAME_AMD
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_BOARD_NAME_AMD, sizeof(info->devicename), info->devicename, NULL);
-#else
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(info->devicename), info->devicename, NULL);
-#endif
-    info->devicevendor[0] = '\0';
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, sizeof(info->devicevendor), info->devicevendor, NULL);
-
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL);
-    switch (type) {
-    case CL_DEVICE_TYPE_CPU: /* FIXME: cannot happen in PCI devices? */
-      strcpy(info->devicetype, "CPU");
-      break;
-    case CL_DEVICE_TYPE_GPU:
-      strcpy(info->devicetype, "GPU");
-      break;
-    case CL_DEVICE_TYPE_ACCELERATOR:
-      strcpy(info->devicetype, "Accelerator");
-      break;
-    default:
-      strcpy(info->devicetype, "Unknown");
-      break;
-    }
-
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemsize), &globalmemsize, NULL);
-    info->globalmemsize = globalmemsize / 1024;
-
-    clGetDeviceInfo(device_ids[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeunits), &computeunits, NULL);
-    info->computeunits = computeunits;
-
-    hwloc_debug("platform %s device %s vendor %s type %s\n", info->platformname, info->devicename, info->devicevendor, info->devicetype);
-
-    /* find our indexes */
-    while (platform_id != platform_ids[curpfidx]) {
-      curpfidx++;
-      curpfdvidx = 0;
-    }
-    info->platformidx = curpfidx;
-    info->platformdeviceidx = curpfdvidx;
-    curpfdvidx++;
-
-    hwloc_debug("This is opencl%dd%d\n", info->platformidx, info->platformdeviceidx);
-
-#ifdef CL_DEVICE_TOPOLOGY_AMD
-    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
-    if (CL_SUCCESS != clret) {
-      hwloc_debug("no AMD-specific device information: %d\n", clret);
-      continue;
-    }
-    if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
-      hwloc_debug("not a PCIe device: %u\n", amdtopo.raw.type);
-      continue;
-    }
-
-    info->type = HWLOC_OPENCL_DEVICE_AMD;
-    info->specific.amd.pcidomain = 0;
-    info->specific.amd.pcibus = amdtopo.pcie.bus;
-    info->specific.amd.pcidev = amdtopo.pcie.device;
-    info->specific.amd.pcifunc = amdtopo.pcie.function;
-
-    hwloc_debug("OpenCL device on PCI 0000:%02x:%02x.%u\n", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
-
-    /* validate this device */
-    data->nr_devices++;
-#endif /* HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD */
-  }
-  free(device_ids);
-  free(platform_ids);
-  return;
-
-out_with_device_ids:
-  free(device_ids);
-  free(data->devices);
-  data->devices = NULL;
-out_with_platform_ids:
-  free(platform_ids);
-out:
-  return;
-}
-
-static int
-hwloc_opencl_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
-				       struct hwloc_obj *pcidev)
-{
-  struct hwloc_topology *topology = backend->topology;
-  struct hwloc_opencl_backend_data_s *data = backend->private_data;
-  unsigned i;
-
-  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
-    return 0;
-
-  if (!hwloc_topology_is_thissystem(topology)) {
-    hwloc_debug("%s", "\nno OpenCL detection (not thissystem)\n");
-    return 0;
-  }
-
-  if (HWLOC_OBJ_PCI_DEVICE != pcidev->type)
-    return 0;
-
-  if (data->nr_devices == (unsigned) -1) {
-    /* first call, lookup all devices */
-    hwloc_opencl_query_devices(data);
-    /* if it fails, data->nr_devices = 0 so we won't do anything below and in next callbacks */
-  }
-
-  if (!data->nr_devices)
-    /* found no devices */
-    return 0;
-
-  /* now the devices array is ready to use */
-  for(i=0; i<data->nr_devices; i++) {
-    struct hwloc_opencl_device_info_s *info = &data->devices[i];
-    hwloc_obj_t osdev;
-    char buffer[64];
-
-    assert(info->type == HWLOC_OPENCL_DEVICE_AMD);
-    if (info->specific.amd.pcidomain != pcidev->attr->pcidev.domain)
-      continue;
-    if (info->specific.amd.pcibus != pcidev->attr->pcidev.bus)
-      continue;
-    if (info->specific.amd.pcidev != pcidev->attr->pcidev.dev)
-      continue;
-    if (info->specific.amd.pcifunc != pcidev->attr->pcidev.func)
-      continue;
-
-    osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
-    snprintf(buffer, sizeof(buffer), "opencl%dd%d", info->platformidx, info->platformdeviceidx);
-    osdev->name = strdup(buffer);
-    osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
-    osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC;
-
-    hwloc_obj_add_info(osdev, "CoProcType", "OpenCL");
-    hwloc_obj_add_info(osdev, "Backend", "OpenCL");
-    hwloc_obj_add_info(osdev, "OpenCLDeviceType", info->devicetype);
-
-    if (info->devicevendor[0] != '\0')
-      hwloc_obj_add_info(osdev, "GPUVendor", info->devicevendor);
-    if (info->devicename[0] != '\0')
-      hwloc_obj_add_info(osdev, "GPUModel", info->devicename);
-
-    snprintf(buffer, sizeof(buffer), "%u", info->platformidx);
-    hwloc_obj_add_info(osdev, "OpenCLPlatformIndex", buffer);
-    if (info->platformname[0] != '\0')
-      hwloc_obj_add_info(osdev, "OpenCLPlatformName", info->platformname);
-
-    snprintf(buffer, sizeof(buffer), "%u", info->platformdeviceidx);
-    hwloc_obj_add_info(osdev, "OpenCLPlatformDeviceIndex", buffer);
-
-    snprintf(buffer, sizeof(buffer), "%u", info->computeunits);
-    hwloc_obj_add_info(osdev, "OpenCLComputeUnits", buffer);
-
-    snprintf(buffer, sizeof(buffer), "%llu", info->globalmemsize);
-    hwloc_obj_add_info(osdev, "OpenCLGlobalMemorySize", buffer);
-
-    hwloc_insert_object_by_parent(topology, pcidev, osdev);
-    return 1;
-  }
-
-  return 0;
-}
-
-static void
-hwloc_opencl_backend_disable(struct hwloc_backend *backend)
-{
-  struct hwloc_opencl_backend_data_s *data = backend->private_data;
-  free(data->devices);
-  free(data);
-}
-
-static struct hwloc_backend *
-hwloc_opencl_component_instantiate(struct hwloc_disc_component *component,
-				   const void *_data1 __hwloc_attribute_unused,
-				   const void *_data2 __hwloc_attribute_unused,
-				   const void *_data3 __hwloc_attribute_unused)
-{
-  struct hwloc_backend *backend;
-  struct hwloc_opencl_backend_data_s *data;
-
-  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
-
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-
-  data = malloc(sizeof(*data));
-  if (!data) {
-    free(backend);
-    return NULL;
-  }
-  /* the first callback will initialize those */
-  data->nr_devices = (unsigned) -1; /* unknown yet */
-  data->devices = NULL;
-
-  backend->private_data = data;
-  backend->disable = hwloc_opencl_backend_disable;
-
-  backend->notify_new_object = hwloc_opencl_backend_notify_new_object;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_opencl_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_MISC,
-  "opencl",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  hwloc_opencl_component_instantiate,
-  10, /* after pci */
-  NULL
-};
-
-static int
-hwloc_opencl_component_init(unsigned long flags)
-{
-  if (flags)
-    return -1;
-  if (hwloc_plugin_check_namespace("opencl", "hwloc_backend_alloc") < 0)
-    return -1;
-  return 0;
-}
-
-#ifdef HWLOC_INSIDE_PLUGIN
-HWLOC_DECLSPEC extern const struct hwloc_component hwloc_opencl_component;
-#endif
-
-const struct hwloc_component hwloc_opencl_component = {
-  HWLOC_COMPONENT_ABI,
-  hwloc_opencl_component_init, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_opencl_disc_component
-};
diff --git a/ext/hwloc/hwloc/topology-osf.cb b/ext/hwloc/hwloc/topology-osf.cb
deleted file mode 100644
index 57158883d..000000000
--- a/ext/hwloc/hwloc/topology-osf.cb
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
- * Copyright © 2009-2011 Université Bordeaux
- * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
- * See COPYING in top-level directory.
- */
-
-#include <private/autogen/config.h>
-
-#include <sys/types.h>
-#ifdef HAVE_DIRENT_H
-#include <dirent.h>
-#endif
-#ifdef HAVE_UNISTD_H
-#include <unistd.h>
-#endif
-#include <string.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <pthread.h>
-
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-
-#include <numa.h>
-#include <radset.h>
-#include <cpuset.h>
-#include <sys/mman.h>
-
-/*
- * TODO
- *
- * nsg_init(), nsg_attach_pid(), RAD_MIGRATE/RAD_WAIT
- * assign_pid_to_pset()
- *
- * pthread_use_only_cpu too?
- */
-
-static int
-prepare_radset(hwloc_topology_t topology __hwloc_attribute_unused, radset_t *radset, hwloc_const_bitmap_t hwloc_set)
-{
-  unsigned cpu;
-  cpuset_t target_cpuset;
-  cpuset_t cpuset, xor_cpuset;
-  radid_t radid;
-  int ret = 0;
-  int ret_errno = 0;
-  int nbnodes = rad_get_num();
-
-  cpusetcreate(&target_cpuset);
-  cpuemptyset(target_cpuset);
-  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
-    cpuaddset(target_cpuset, cpu);
-  hwloc_bitmap_foreach_end();
-
-  cpusetcreate(&cpuset);
-  cpusetcreate(&xor_cpuset);
-  for (radid = 0; radid < nbnodes; radid++) {
-    cpuemptyset(cpuset);
-    if (rad_get_cpus(radid, cpuset)==-1) {
-      fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
-      continue;
-    }
-    cpuxorset(target_cpuset, cpuset, xor_cpuset);
-    if (cpucountset(xor_cpuset) == 0) {
-      /* Found it */
-      radsetcreate(radset);
-      rademptyset(*radset);
-      radaddset(*radset, radid);
-      ret = 1;
-      goto out;
-    }
-  }
-  /* radset containing exactly this set of CPUs not found */
-  ret_errno = EXDEV;
-
-out:
-  cpusetdestroy(&target_cpuset);
-  cpusetdestroy(&cpuset);
-  cpusetdestroy(&xor_cpuset);
-  errno = ret_errno;
-  return ret;
-}
-
-/* Note: get_cpubind not available on OSF */
-
-static int
-hwloc_osf_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  radset_t radset;
-
-  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
-    if ((errno = pthread_rad_detach(thread)))
-      return -1;
-    return 0;
-  }
-
-  /* Apparently OSF migrates pages */
-  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
-    errno = ENOSYS;
-    return -1;
-  }
-
-  if (!prepare_radset(topology, &radset, hwloc_set))
-    return -1;
-
-  if (flags & HWLOC_CPUBIND_STRICT) {
-    if ((errno = pthread_rad_bind(thread, radset, RAD_INSIST | RAD_WAIT)))
-      return -1;
-  } else {
-    if ((errno = pthread_rad_attach(thread, radset, RAD_WAIT)))
-      return -1;
-  }
-  radsetdestroy(&radset);
-
-  return 0;
-}
-
-static int
-hwloc_osf_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  radset_t radset;
-
-  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
-    if (rad_detach_pid(pid))
-      return -1;
-    return 0;
-  }
-
-  /* Apparently OSF migrates pages */
-  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
-    errno = ENOSYS;
-    return -1;
-  }
-
-  if (!prepare_radset(topology, &radset, hwloc_set))
-    return -1;
-
-  if (flags & HWLOC_CPUBIND_STRICT) {
-    if (rad_bind_pid(pid, radset, RAD_INSIST | RAD_WAIT))
-      return -1;
-  } else {
-    if (rad_attach_pid(pid, radset, RAD_WAIT))
-      return -1;
-  }
-  radsetdestroy(&radset);
-
-  return 0;
-}
-
-static int
-hwloc_osf_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  return hwloc_osf_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
-}
-
-static int
-hwloc_osf_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
-{
-  return hwloc_osf_set_proc_cpubind(topology, getpid(), hwloc_set, flags);
-}
-
-static int
-hwloc_osf_prepare_mattr(hwloc_topology_t topology __hwloc_attribute_unused, memalloc_attr_t *mattr, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags __hwloc_attribute_unused)
-{
-  unsigned long osf_policy;
-  int node;
-
-  switch (policy) {
-    case HWLOC_MEMBIND_FIRSTTOUCH:
-      osf_policy = MPOL_THREAD;
-      break;
-    case HWLOC_MEMBIND_DEFAULT:
-    case HWLOC_MEMBIND_BIND:
-      osf_policy = MPOL_DIRECTED;
-      break;
-    case HWLOC_MEMBIND_INTERLEAVE:
-      osf_policy = MPOL_STRIPPED;
-      break;
-    case HWLOC_MEMBIND_REPLICATE:
-      osf_policy = MPOL_REPLICATED;
-      break;
-    default:
-      errno = ENOSYS;
-      return -1;
-  }
-
-  memset(mattr, 0, sizeof(*mattr));
-  mattr->mattr_policy = osf_policy;
-  mattr->mattr_rad = RAD_NONE;
-  radsetcreate(&mattr->mattr_radset);
-  rademptyset(mattr->mattr_radset);
-
-  hwloc_bitmap_foreach_begin(node, nodeset)
-    radaddset(mattr->mattr_radset, node);
-  hwloc_bitmap_foreach_end();
-  return 0;
-}
-
-static int
-hwloc_osf_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
-{
-  memalloc_attr_t mattr;
-  int behavior = 0;
-  int ret;
-
-  if (flags & HWLOC_MEMBIND_MIGRATE)
-    behavior |= MADV_CURRENT;
-  if (flags & HWLOC_MEMBIND_STRICT)
-    behavior |= MADV_INSIST;
-
-  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
-    return -1;
-
-  ret = nmadvise(addr, len, MADV_CURRENT, &mattr);
-  radsetdestroy(&mattr.mattr_radset);
-  return ret;
-}
-
-static void *
-hwloc_osf_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
-{
-  memalloc_attr_t mattr;
-  void *ptr;
-
-  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
-    return hwloc_alloc_or_fail(topology, len, flags);
-
-  /* TODO: rather use acreate/amalloc ? */
-  ptr = nmmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1,
-               0, &mattr);
-  radsetdestroy(&mattr.mattr_radset);
-  return ptr;
-}
-
-static int
-hwloc_look_osf(struct hwloc_backend *backend)
-{
-  struct hwloc_topology *topology = backend->topology;
-  cpu_cursor_t cursor;
-  unsigned nbnodes;
-  radid_t radid, radid2;
-  radset_t radset, radset2;
-  cpuid_t cpuid;
-  cpuset_t cpuset;
-  struct hwloc_obj *obj;
-  unsigned distance;
-
-  if (topology->levels[0][0]->cpuset)
-    /* somebody discovered things */
-    return 0;
-
-  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
-
-  nbnodes = rad_get_num();
-
-  cpusetcreate(&cpuset);
-  radsetcreate(&radset);
-  radsetcreate(&radset2);
-  {
-    hwloc_obj_t *nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
-    unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
-    float *distances = calloc(nbnodes*nbnodes, sizeof(float));
-    unsigned nfound;
-    numa_attr_t attr;
-
-    attr.nattr_type = R_RAD;
-    attr.nattr_descr.rd_radset = radset;
-    attr.nattr_flags = 0;
-
-    for (radid = 0; radid < (radid_t) nbnodes; radid++) {
-      rademptyset(radset);
-      radaddset(radset, radid);
-      cpuemptyset(cpuset);
-      if (rad_get_cpus(radid, cpuset)==-1) {
-	fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
-	continue;
-      }
-
-      indexes[radid] = radid;
-      nodes[radid] = obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, radid);
-      obj->nodeset = hwloc_bitmap_alloc();
-      hwloc_bitmap_set(obj->nodeset, radid);
-      obj->cpuset = hwloc_bitmap_alloc();
-      obj->memory.local_memory = rad_get_physmem(radid) * hwloc_getpagesize();
-      obj->memory.page_types_len = 2;
-      obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
-      memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
-      obj->memory.page_types[0].size = hwloc_getpagesize();
-#ifdef HAVE__SC_LARGE_PAGESIZE
-      obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
-#endif
-
-      cursor = SET_CURSOR_INIT;
-      while((cpuid = cpu_foreach(cpuset, 0, &cursor)) != CPU_NONE)
-	hwloc_bitmap_set(obj->cpuset, cpuid);
-
-      hwloc_debug_1arg_bitmap("node %d has cpuset %s\n",
-		 radid, obj->cpuset);
-
-      hwloc_insert_object_by_cpuset(topology, obj);
-
-      nfound = 0;
-      for (radid2 = 0; radid2 < (radid_t) nbnodes; radid2++)
-	distances[radid*nbnodes+radid2] = RAD_DIST_REMOTE;
-      for (distance = RAD_DIST_LOCAL; distance < RAD_DIST_REMOTE; distance++) {
-	attr.nattr_distance = distance;
-	/* get set of NUMA nodes at distance <= DISTANCE */
-	if (nloc(&attr, radset2)) {
-	  fprintf(stderr,"nloc failed: %s\n", strerror(errno));
-	  continue;
-	}
-	cursor = SET_CURSOR_INIT;
-	while ((radid2 = rad_foreach(radset2, 0, &cursor)) != RAD_NONE) {
-	  if (distances[radid*nbnodes+radid2] == RAD_DIST_REMOTE) {
-            distances[radid*nbnodes+radid2] = (float) distance;
-	    nfound++;
-	  }
-	}
-	if (nfound == nbnodes)
-	  /* Finished finding distances, no need to go up to RAD_DIST_REMOTE */
-	  break;
-      }
-    }
-
-    hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
-  }
-  radsetdestroy(&radset2);
-  radsetdestroy(&radset);
-  cpusetdestroy(&cpuset);
-
-  /* add PU objects */
-  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
-
-  hwloc_obj_add_info(topology->levels[0][0], "Backend", "OSF");
-  if (topology->is_thissystem)
-    hwloc_add_uname_info(topology, NULL);
-  return 1;
-}
-
-void
-hwloc_set_osf_hooks(struct hwloc_binding_hooks *hooks,
-		    struct hwloc_topology_support *support)
-{
-  hooks->set_thread_cpubind = hwloc_osf_set_thread_cpubind;
-  hooks->set_thisthread_cpubind = hwloc_osf_set_thisthread_cpubind;
-  hooks->set_proc_cpubind = hwloc_osf_set_proc_cpubind;
-  hooks->set_thisproc_cpubind = hwloc_osf_set_thisproc_cpubind;
-  hooks->set_area_membind = hwloc_osf_set_area_membind;
-  hooks->alloc_membind = hwloc_osf_alloc_membind;
-  hooks->alloc = hwloc_alloc_mmap;
-  hooks->free_membind = hwloc_free_mmap;
-  support->membind->firsttouch_membind = 1;
-  support->membind->bind_membind = 1;
-  support->membind->interleave_membind = 1;
-  support->membind->replicate_membind = 1;
-}
-
-static struct hwloc_backend *
-hwloc_osf_component_instantiate(struct hwloc_disc_component *component,
-				const void *_data1 __hwloc_attribute_unused,
-				const void *_data2 __hwloc_attribute_unused,
-				const void *_data3 __hwloc_attribute_unused)
-{
-  struct hwloc_backend *backend;
-  backend = hwloc_backend_alloc(component);
-  if (!backend)
-    return NULL;
-  backend->discover = hwloc_look_osf;
-  return backend;
-}
-
-static struct hwloc_disc_component hwloc_osf_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
-  "osf",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
-  hwloc_osf_component_instantiate,
-  50,
-  NULL
-};
-
-const struct hwloc_component hwloc_osf_component = {
-  HWLOC_COMPONENT_ABI,
-  NULL, NULL,
-  HWLOC_COMPONENT_TYPE_DISC,
-  0,
-  &hwloc_osf_disc_component
-};
diff --git a/ext/hwloc/hwloc/topology-synthetic.c b/ext/hwloc/hwloc/topology-synthetic.c
index 237729a89..686efce1f 100644
--- a/ext/hwloc/hwloc/topology-synthetic.c
+++ b/ext/hwloc/hwloc/topology-synthetic.c
@@ -1,16 +1,16 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2010 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/misc.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/misc.h"
+#include "private/debug.h"
 
 #include <limits.h>
 #include <assert.h>
@@ -18,27 +18,45 @@
 #include <strings.h>
 #endif
 
-struct hwloc_synthetic_level_data_s {
-  unsigned arity;
-  unsigned long totalwidth;
+struct hwloc_synthetic_attr_s {
   hwloc_obj_type_t type;
   unsigned depth; /* For caches/groups */
   hwloc_obj_cache_type_t cachetype; /* For caches */
   hwloc_uint64_t memorysize; /* For caches/memory */
+};
 
+struct hwloc_synthetic_indexes_s {
   /* the indexes= attribute before parsing */
-  const char *index_string;
-  unsigned long index_string_length;
+  const char *string;
+  unsigned long string_length;
   /* the array of explicit indexes after parsing */
-  unsigned *index_array;
+  unsigned *array;
 
   /* used while filling the topology */
-  unsigned next_os_index; /* id of the next object for that level */
+  unsigned next; /* id of the next object for that level */
+};
+
+struct hwloc_synthetic_level_data_s {
+  unsigned arity;
+  unsigned long totalwidth;
+
+  struct hwloc_synthetic_attr_s attr;
+  struct hwloc_synthetic_indexes_s indexes;
+
+  struct hwloc_synthetic_attached_s {
+    struct hwloc_synthetic_attr_s attr;
+
+    struct hwloc_synthetic_attached_s *next;
+  } *attached;
 };
 
 struct hwloc_synthetic_backend_data_s {
   /* synthetic backend parameters */
   char *string;
+
+  unsigned long numa_attached_nr;
+  struct hwloc_synthetic_indexes_s numa_attached_indexes;
+
 #define HWLOC_SYNTHETIC_MAX_DEPTH 128
   struct hwloc_synthetic_level_data_s level[HWLOC_SYNTHETIC_MAX_DEPTH];
 };
@@ -50,17 +68,15 @@ struct hwloc_synthetic_intlv_loop_s {
 };
 
 static void
-hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *data,
-				      unsigned curleveldepth,
-				      int verbose)
+hwloc_synthetic_process_indexes(struct hwloc_synthetic_backend_data_s *data,
+				struct hwloc_synthetic_indexes_s *indexes,
+				unsigned long total,
+				int verbose)
 {
-  struct hwloc_synthetic_level_data_s *curlevel = &data->level[curleveldepth];
-  unsigned long total = curlevel->totalwidth;
-  const char *attr = curlevel->index_string;
-  unsigned long length = curlevel->index_string_length;
+  const char *attr = indexes->string;
+  unsigned long length = indexes->string_length;
   unsigned *array = NULL;
-  struct hwloc_synthetic_intlv_loop_s * loops = NULL;
-  unsigned long i;
+  size_t i;
 
   if (!attr)
     return;
@@ -81,7 +97,7 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
       unsigned idx = strtoul(attr, (char **) &next, 10);
       if (next == attr) {
 	if (verbose)
-	  fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", i, attr);
+	  fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", (unsigned long) i, attr);
 	goto out_with_array;
       }
 
@@ -89,7 +105,7 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
       if (i != total-1) {
 	if (*next != ',') {
 	  if (verbose)
-	    fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", i, attr);
+	    fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", (unsigned long) i, attr);
 	  goto out_with_array;
 	}
 	attr = next+1;
@@ -97,7 +113,7 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
 	attr = next;
       }
     }
-    curlevel->index_array = array;
+    indexes->array = array;
 
   } else {
     /* interleaving */
@@ -106,6 +122,7 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
     unsigned long nbs = 1;
     unsigned j, mul;
     const char *tmp;
+    struct hwloc_synthetic_intlv_loop_s *loops;
 
     tmp = attr;
     while (tmp) {
@@ -115,13 +132,11 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
       nr_loops++;
       tmp++;
     }
+
     /* nr_loops colon-separated fields, but we may need one more at the end */
-    loops = malloc((nr_loops+1)*sizeof(*loops));
-    if (!loops) {
-      if (verbose)
-	fprintf(stderr, "Failed to allocate synthetic index interleave loop array of size %u\n", nr_loops);
+    loops = malloc((nr_loops+1) * sizeof(*loops));
+    if (!loops)
       goto out_with_array;
-    }
 
     if (*attr >= '0' && *attr <= '9') {
       /* interleaving as x*y:z*t:... */
@@ -135,24 +150,28 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
 	if (tmp2 == tmp || *tmp2 != '*') {
 	  if (verbose)
 	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
 	if (!step) {
 	  if (verbose)
 	    fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
 	tmp2++;
 	nb = (unsigned) strtol(tmp2, &tmp3, 0);
 	if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) {
 	  if (verbose)
 	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
 	if (!nb) {
 	  if (verbose)
 	    fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
 	loops[cur_loop].step = step;
 	loops[cur_loop].nb = nb;
@@ -168,44 +187,46 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
     } else {
       /* interleaving as type1:type2:... */
       hwloc_obj_type_t type;
-      hwloc_obj_cache_type_t cachetypeattr;
-      int depthattr;
+      union hwloc_obj_attr_u attrs;
       int err;
 
       /* find level depths for each interleaving loop */
       tmp = attr;
       cur_loop = 0;
       while (tmp) {
-	err = hwloc_obj_type_sscanf(tmp, &type, &depthattr, &cachetypeattr, sizeof(cachetypeattr));
+	err = hwloc_type_sscanf(tmp, &type, &attrs, sizeof(attrs));
 	if (err < 0) {
 	  if (verbose)
 	    fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
 	if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
 	  if (verbose)
 	    fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp);
-	  goto out_with_loops;
+	  free(loops);
+	  goto out_with_array;
 	}
-	for(i=0; i<curleveldepth; i++) {
-	  if (type != data->level[i].type)
-	    continue;
-	  if ((type == HWLOC_OBJ_GROUP || type == HWLOC_OBJ_CACHE)
-	      && depthattr != -1
-	      && (unsigned) depthattr != data->level[i].depth)
+	for(i=0; ; i++) {
+	  if (!data->level[i].arity) {
+	    loops[cur_loop].level_depth = (unsigned)-1;
+	    break;
+	  }
+	  if (type != data->level[i].attr.type)
 	    continue;
-	  if (type == HWLOC_OBJ_CACHE
-	      && cachetypeattr != (hwloc_obj_cache_type_t) -1
-	      && cachetypeattr != data->level[i].cachetype)
+	  if (type == HWLOC_OBJ_GROUP
+	      && attrs.group.depth != (unsigned) -1
+	      && attrs.group.depth != data->level[i].attr.depth)
 	    continue;
-	  loops[cur_loop].level_depth = i;
+	  loops[cur_loop].level_depth = (unsigned)i;
 	  break;
 	}
-	if (i == curleveldepth) {
+	if (loops[cur_loop].level_depth == (unsigned)-1) {
 	  if (verbose)
-	    fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s' above '%s'\n",
-		    tmp, hwloc_obj_type_string(curlevel->type));
-	  goto out_with_loops;
+	    fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s'\n",
+		    tmp);
+	  free(loops);
+	  goto out_with_array;
 	}
 	tmp = strchr(tmp, ':');
 	if (!tmp || tmp > attr+length)
@@ -223,13 +244,14 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
 	  if (loops[i].level_depth == mydepth && i != cur_loop) {
 	    if (verbose)
 	      fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr);
-	    goto out_with_loops;
+	    free(loops);
+	    goto out_with_array;
 	  }
 	  if (loops[i].level_depth < mydepth
 	      && loops[i].level_depth > prevdepth)
 	    prevdepth = loops[i].level_depth;
 	}
-	step = curlevel->totalwidth / data->level[mydepth].totalwidth; /* number of objects below us */
+	step = total / data->level[mydepth].totalwidth; /* number of objects below us */
 	nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */
 
 	loops[cur_loop].step = step;
@@ -252,7 +274,8 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
       } else {
 	if (verbose)
 	  fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total);
-	goto out_with_loops;
+	free(loops);
+	goto out_with_array;
       }
     }
 
@@ -266,28 +289,27 @@ hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *dat
       mul *= nb;
     }
 
+    free(loops);
+
     /* check that we have the right values (cannot pass total, cannot give duplicate 0) */
     for(j=0; j<total; j++) {
       if (array[j] >= total) {
 	if (verbose)
 	  fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]);
-	goto out_with_loops;
+	goto out_with_array;
       }
       if (!array[j] && j) {
 	if (verbose)
 	  fprintf(stderr, "Invalid index interleaving generates duplicate index values\n");
-	goto out_with_loops;
+	goto out_with_array;
       }
     }
 
-    free(loops);
-    curlevel->index_array = array;
+    indexes->array = array;
   }
 
   return;
 
- out_with_loops:
-  free(loops);
  out_with_array:
   free(array);
  out:
@@ -318,15 +340,16 @@ hwloc_synthetic_parse_memory_attr(const char *attr, const char **endp)
 }
 
 static int
-hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
-				  struct hwloc_synthetic_level_data_s *curlevel,
-				  int verbose)
+hwloc_synthetic_parse_attrs(const char *attrs, const char **next_posp,
+			    struct hwloc_synthetic_attr_s *sattr,
+			    struct hwloc_synthetic_indexes_s *sind,
+			    int verbose)
 {
-  hwloc_obj_type_t type = curlevel->type;
+  hwloc_obj_type_t type = sattr->type;
   const char *next_pos;
   hwloc_uint64_t memorysize = 0;
   const char *index_string = NULL;
-  unsigned long index_string_length = 0;
+  size_t index_string_length = 0;
 
   next_pos = (const char *) strchr(attrs, ')');
   if (!next_pos) {
@@ -337,10 +360,12 @@ hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
   }
 
   while (')' != *attrs) {
-    if (HWLOC_OBJ_CACHE == type && !strncmp("size=", attrs, 5)) {
+    int iscache = hwloc__obj_type_is_cache(type);
+
+    if (iscache && !strncmp("size=", attrs, 5)) {
       memorysize = hwloc_synthetic_parse_memory_attr(attrs+5, &attrs);
 
-    } else if (HWLOC_OBJ_CACHE != type && !strncmp("memory=", attrs, 7)) {
+    } else if (!iscache && !strncmp("memory=", attrs, 7)) {
       memorysize = hwloc_synthetic_parse_memory_attr(attrs+7, &attrs);
 
     } else if (!strncmp("indexes=", attrs, 8)) {
@@ -366,13 +391,39 @@ hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
     }
   }
 
-  curlevel->memorysize = memorysize;
-  curlevel->index_string = index_string;
-  curlevel->index_string_length = index_string_length;
+  sattr->memorysize = memorysize;
+
+  if (index_string) {
+    if (sind->string && verbose)
+      fprintf(stderr, "Overwriting duplicate indexes attribute with last occurence\n");
+    sind->string = index_string;
+    sind->string_length = (unsigned long)index_string_length;
+  }
+
   *next_posp = next_pos+1;
   return 0;
 }
 
+/* Release every attached list and index array, stopping after the first level whose arity is 0. */
+static void
+hwloc_synthetic_free_levels(struct hwloc_synthetic_backend_data_s *data)
+{
+  unsigned depth = 0;
+  struct hwloc_synthetic_level_data_s *level;
+
+  do {
+    struct hwloc_synthetic_attached_s *node, *following;
+    level = &data->level[depth];
+    for(node = level->attached; node; node = following) {
+      following = node->next;
+      free(node);
+    }
+    level->attached = NULL; /* leave the list head empty once all nodes are freed */
+    free(level->indexes.array);
+  } while (level->arity && ++depth < HWLOC_SYNTHETIC_MAX_DEPTH);
+  free(data->numa_attached_indexes.array);
+}
+
 /* Read from description a series of integers describing a symmetrical
    topology and update the hwloc_synthetic_backend_data_s accordingly.  On
    success, return zero.  */
@@ -383,9 +434,8 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
   const char *pos, *next_pos;
   unsigned long item, count;
   unsigned i;
-  int cache_depth = 0, group_depth = 0;
-  int nb_machine_levels = 0, nb_node_levels = 0;
-  int nb_pu_levels = 0;
+  int type_count[HWLOC_OBJ_TYPE_MAX];
+  unsigned unset;
   int verbose = 0;
   const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
   int err;
@@ -394,23 +444,29 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
   if (env)
     verbose = atoi(env);
 
+  data->numa_attached_nr = 0;
+  data->numa_attached_indexes.array = NULL;
+
   /* default values before we add root attributes */
   data->level[0].totalwidth = 1;
-  data->level[0].type = HWLOC_OBJ_MACHINE;
-  data->level[0].index_string = NULL;
-  data->level[0].index_array = NULL;
-  data->level[0].memorysize = 0;
+  data->level[0].attr.type = HWLOC_OBJ_MACHINE;
+  data->level[0].indexes.string = NULL;
+  data->level[0].indexes.array = NULL;
+  data->level[0].attr.memorysize = 0;
+  data->level[0].attached = NULL;
+  type_count[HWLOC_OBJ_MACHINE] = 1;
   if (*description == '(') {
-    err = hwloc_synthetic_parse_level_attrs(description+1, &description, &data->level[0], verbose);
+    err = hwloc_synthetic_parse_attrs(description+1, &description, &data->level[0].attr, &data->level[0].indexes, verbose);
     if (err < 0)
       return err;
   }
 
+  data->numa_attached_indexes.string = NULL;
+  data->numa_attached_indexes.array = NULL;
+
   for (pos = description, count = 1; *pos; pos = next_pos) {
-#define HWLOC_OBJ_TYPE_UNKNOWN ((hwloc_obj_type_t) -1)
-    hwloc_obj_type_t type = HWLOC_OBJ_TYPE_UNKNOWN;
-    int typedepth = -1;
-    hwloc_obj_cache_type_t cachetype = (hwloc_obj_cache_type_t) -1;
+    hwloc_obj_type_t type = HWLOC_OBJ_TYPE_NONE;
+    union hwloc_obj_attr_u attrs;
 
     /* initialize parent arity to 0 so that the levels are not infinite */
     data->level[count-1].arity = 0;
@@ -421,14 +477,80 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
     if (!*pos)
       break;
 
-    if (*pos < '0' || *pos > '9') {
-      if (hwloc_obj_type_sscanf(pos, &type, &typedepth, &cachetype, sizeof(cachetype)) < 0) {
+    if (*pos == '[') {
+      /* attached */
+      struct hwloc_synthetic_attached_s *attached, **pprev;
+      char *attr;
+
+      pos++;
+
+      if (hwloc_type_sscanf(pos, &type, &attrs, sizeof(attrs)) < 0) {
 	if (verbose)
-	  fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+	  fprintf(stderr, "Synthetic string with unknown attached object type at '%s'\n", pos);
 	errno = EINVAL;
 	goto error;
       }
-      if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+      if (type != HWLOC_OBJ_NUMANODE) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with disallowed attached object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      data->numa_attached_nr += data->level[count-1].totalwidth;
+
+      attached = malloc(sizeof(*attached));
+      if (attached) {
+	attached->attr.type = type;
+	attached->attr.memorysize = 0;
+	/* attached->attr.depth and .cachetype unused */
+	attached->next = NULL;
+	pprev = &data->level[count-1].attached;
+	while (*pprev)
+	  pprev = &((*pprev)->next);
+	*pprev = attached;
+      }
+
+      next_pos = strchr(pos, ']');
+      if (!next_pos) {
+	if (verbose)
+	  fprintf(stderr,"Synthetic string doesn't have a closing `]' after attached object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+
+      attr = strchr(pos, '(');
+      if (attr && attr < next_pos && attached) {
+	const char *dummy;
+	err = hwloc_synthetic_parse_attrs(attr+1, &dummy, &attached->attr, &data->numa_attached_indexes, verbose);
+	if (err < 0)
+	  goto error;
+      }
+
+      next_pos++;
+      continue;
+    }
+
+    /* normal level */
+
+    /* reset defaults */
+    data->level[count].indexes.string = NULL;
+    data->level[count].indexes.array = NULL;
+    data->level[count].attached = NULL;
+
+    if (*pos < '0' || *pos > '9') {
+      if (hwloc_type_sscanf(pos, &type, &attrs, sizeof(attrs)) < 0) {
+	if (!strncmp(pos, "Tile", 4) || !strncmp(pos, "Module", 6)) {
+	  /* possible future types */
+	  type = HWLOC_OBJ_GROUP;
+	} else {
+	  /* FIXME: allow generic "Cache" string? would require to deal with possibly duplicate cache levels */
+	  if (verbose)
+	    fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+	  errno = EINVAL;
+	  goto error;
+	}
+      }
+      if (type == HWLOC_OBJ_MACHINE || type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
 	if (verbose)
 	  fprintf(stderr, "Synthetic string with disallowed object type at '%s'\n", pos);
 	errno = EINVAL;
@@ -444,10 +566,20 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
       }
       pos = next_pos + 1;
     }
-    data->level[count].type = type;
-    data->level[count].depth = (unsigned) typedepth;
-    data->level[count].cachetype = cachetype;
 
+    data->level[count].attr.type = type;
+    data->level[count].attr.depth = (unsigned) -1;
+    data->level[count].attr.cachetype = (hwloc_obj_cache_type_t) -1;
+    if (hwloc__obj_type_is_cache(type)) {
+      /* these are always initialized */
+      data->level[count].attr.depth = attrs.cache.depth;
+      data->level[count].attr.cachetype = attrs.cache.type;
+    } else if (type == HWLOC_OBJ_GROUP) {
+      /* could be -1 but will be set below */
+      data->level[count].attr.depth = attrs.group.depth;
+    }
+
+    /* number of normal children */
     item = strtoul(pos, (char **)&next_pos, 0);
     if (next_pos == pos) {
       if (verbose)
@@ -455,15 +587,20 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
       errno = EINVAL;
       goto error;
     }
-    data->level[count-1].arity = (unsigned)item;
+    if (!item) {
+      if (verbose)
+	fprintf(stderr,"Synthetic string with disallow 0 number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
 
     totalarity *= item;
     data->level[count].totalwidth = totalarity;
-    data->level[count].index_string = NULL;
-    data->level[count].index_array = NULL;
-    data->level[count].memorysize = 0;
+    data->level[count].indexes.string = NULL;
+    data->level[count].indexes.array = NULL;
+    data->level[count].attr.memorysize = 0;
     if (*next_pos == '(') {
-      err = hwloc_synthetic_parse_level_attrs(next_pos+1, &next_pos, &data->level[count], verbose);
+      err = hwloc_synthetic_parse_attrs(next_pos+1, &next_pos, &data->level[count].attr, &data->level[count].indexes, verbose);
       if (err < 0)
 	goto error;
     }
@@ -481,208 +618,303 @@ hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
       goto error;
     }
 
+    data->level[count-1].arity = (unsigned)item;
     count++;
   }
 
-  if (count <= 0) {
+  if (data->level[count-1].attr.type != HWLOC_OBJ_TYPE_NONE && data->level[count-1].attr.type != HWLOC_OBJ_PU) {
     if (verbose)
-      fprintf(stderr, "Synthetic string doesn't contain any object\n");
+      fprintf(stderr, "Synthetic string cannot use non-PU type for last level\n");
     errno = EINVAL;
-    goto error;
+    return -1;
   }
+  data->level[count-1].attr.type = HWLOC_OBJ_PU;
 
+  for(i=HWLOC_OBJ_TYPE_MIN; i<HWLOC_OBJ_TYPE_MAX; i++) {
+    type_count[i] = 0;
+  }
   for(i=count-1; i>0; i--) {
-    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
-    hwloc_obj_type_t type;
-
-    type = curlevel->type;
-
-    if (type == HWLOC_OBJ_TYPE_UNKNOWN) {
-      if (i == count-1)
-	type = HWLOC_OBJ_PU;
-      else {
-	switch (data->level[i+1].type) {
-	case HWLOC_OBJ_PU: type = HWLOC_OBJ_CORE; break;
-	case HWLOC_OBJ_CORE: type = HWLOC_OBJ_CACHE; break;
-	case HWLOC_OBJ_CACHE: type = HWLOC_OBJ_PACKAGE; break;
-	case HWLOC_OBJ_PACKAGE: type = HWLOC_OBJ_NUMANODE; break;
-	case HWLOC_OBJ_NUMANODE:
-	case HWLOC_OBJ_MACHINE:
-	case HWLOC_OBJ_GROUP: type = HWLOC_OBJ_GROUP; break;
-	default:
-	  assert(0);
-	}
-      }
-      curlevel->type = type;
-    }
-    switch (type) {
-      case HWLOC_OBJ_PU:
-	nb_pu_levels++;
-	break;
-      case HWLOC_OBJ_CACHE:
-	cache_depth++;
-	break;
-      case HWLOC_OBJ_GROUP:
-	group_depth++;
-	break;
-      case HWLOC_OBJ_NUMANODE:
-	nb_node_levels++;
-	break;
-      case HWLOC_OBJ_MACHINE:
-	nb_machine_levels++;
-	break;
-      default:
-	break;
+    hwloc_obj_type_t type = data->level[i].attr.type;
+    if (type != HWLOC_OBJ_TYPE_NONE) {
+      type_count[type]++;
     }
   }
 
-  if (!nb_pu_levels) {
+  /* sanity checks */
+  if (!type_count[HWLOC_OBJ_PU]) {
     if (verbose)
       fprintf(stderr, "Synthetic string missing ending number of PUs\n");
     errno = EINVAL;
     return -1;
+  } else if (type_count[HWLOC_OBJ_PU] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several PU levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_PACKAGE] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several package levels\n");
+    errno = EINVAL;
+    return -1;
   }
-  if (nb_pu_levels > 1) {
+  if (type_count[HWLOC_OBJ_DIE] > 1) {
     if (verbose)
-      fprintf(stderr, "Synthetic string can not have several PU levels\n");
+      fprintf(stderr, "Synthetic string cannot have several die levels\n");
     errno = EINVAL;
     return -1;
   }
-  if (nb_node_levels > 1) {
+  if (type_count[HWLOC_OBJ_NUMANODE] > 1) {
     if (verbose)
-      fprintf(stderr, "Synthetic string can not have several NUMA node levels\n");
+      fprintf(stderr, "Synthetic string cannot have several NUMA node levels\n");
     errno = EINVAL;
     return -1;
   }
-  if (nb_machine_levels > 1) {
+  if (type_count[HWLOC_OBJ_NUMANODE] && data->numa_attached_nr) {
     if (verbose)
-      fprintf(stderr, "Synthetic string can not have several machine levels\n");
+      fprintf(stderr,"Synthetic string cannot have NUMA nodes both as a level and attached\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (type_count[HWLOC_OBJ_CORE] > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot have several core levels\n");
     errno = EINVAL;
     return -1;
   }
 
-  if (nb_machine_levels)
-    data->level[0].type = HWLOC_OBJ_SYSTEM;
-  else {
-    data->level[0].type = HWLOC_OBJ_MACHINE;
-    nb_machine_levels++;
+  /* deal with missing intermediate levels */
+  unset = 0;
+  for(i=1; i<count-1; i++) {
+    if (data->level[i].attr.type == HWLOC_OBJ_TYPE_NONE)
+      unset++;
+  }
+  if (unset && unset != count-2) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string cannot mix unspecified and specified types for levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (unset) {
+    /* we want in priority: numa, package, core, up to 3 caches, groups */
+    unsigned _count = count;
+    unsigned neednuma = 0;
+    unsigned needpack = 0;
+    unsigned needcore = 0;
+    unsigned needcaches = 0;
+    unsigned needgroups = 0;
+    /* 2 levels for machine and PU */
+    _count -= 2;
+
+    neednuma = (_count >= 1 && !data->numa_attached_nr);
+    _count -= neednuma;
+
+    needpack = (_count >= 1);
+    _count -= needpack;
+
+    needcore = (_count >= 1);
+    _count -= needcore;
+
+    needcaches = (_count > 4 ? 4 : _count);
+    _count -= needcaches;
+
+    needgroups = _count;
+
+    /* we place them in order: groups, package, numa, caches, core */
+    for(i = 0; i < needgroups; i++) {
+      unsigned depth = 1 + i;
+      data->level[depth].attr.type = HWLOC_OBJ_GROUP;
+      type_count[HWLOC_OBJ_GROUP]++;
+    }
+    if (needpack) {
+      unsigned depth = 1 + needgroups;
+      data->level[depth].attr.type = HWLOC_OBJ_PACKAGE;
+      type_count[HWLOC_OBJ_PACKAGE] = 1;
+    }
+    if (neednuma) {
+      unsigned depth = 1 + needgroups + needpack;
+      data->level[depth].attr.type = HWLOC_OBJ_NUMANODE;
+      type_count[HWLOC_OBJ_NUMANODE] = 1;
+    }
+    if (needcaches) {
+      /* priority: l2, l1, l3, l1i */
+      /* order: l3, l2, l1, l1i */
+      unsigned l3depth = 1 + needgroups + needpack + neednuma;
+      unsigned l2depth = l3depth + (needcaches >= 3);
+      unsigned l1depth = l2depth + 1;
+      unsigned l1idepth = l1depth + 1;
+      if (needcaches >= 3) {
+	data->level[l3depth].attr.type = HWLOC_OBJ_L3CACHE;
+	data->level[l3depth].attr.depth = 3;
+	data->level[l3depth].attr.cachetype = HWLOC_OBJ_CACHE_UNIFIED;
+	type_count[HWLOC_OBJ_L3CACHE] = 1;
+      }
+      data->level[l2depth].attr.type = HWLOC_OBJ_L2CACHE;
+      data->level[l2depth].attr.depth = 2;
+      data->level[l2depth].attr.cachetype = HWLOC_OBJ_CACHE_UNIFIED;
+      type_count[HWLOC_OBJ_L2CACHE] = 1;
+      if (needcaches >= 2) {
+	data->level[l1depth].attr.type = HWLOC_OBJ_L1CACHE;
+	data->level[l1depth].attr.depth = 1;
+	data->level[l1depth].attr.cachetype = HWLOC_OBJ_CACHE_DATA;
+	type_count[HWLOC_OBJ_L1CACHE] = 1;
+      }
+      if (needcaches >= 4) {
+	data->level[l1idepth].attr.type = HWLOC_OBJ_L1ICACHE;
+	data->level[l1idepth].attr.depth = 1;
+	data->level[l1idepth].attr.cachetype = HWLOC_OBJ_CACHE_INSTRUCTION;
+	type_count[HWLOC_OBJ_L1ICACHE] = 1;
+      }
+    }
+    if (needcore) {
+      unsigned depth = 1 + needgroups + needpack + neednuma + needcaches;
+      data->level[depth].attr.type = HWLOC_OBJ_CORE;
+      type_count[HWLOC_OBJ_CORE] = 1;
+    }
   }
 
   /* enforce a NUMA level */
-  if (!nb_node_levels) {
-    /* insert a NUMA level and the machine level */
-    if (data->level[1].type == HWLOC_OBJ_MACHINE)
-      /* there's an explicit machine level after the automatic system root, insert below both */
-      i = 2;
-    else
-      /* insert below the automatic machine root */
-      i = 1;
+  if (!type_count[HWLOC_OBJ_NUMANODE] && !data->numa_attached_nr) {
+    /* insert a NUMA level below the automatic machine root */
     if (verbose)
-      fprintf(stderr, "Inserting a NUMA level with a single object at depth %u\n", i);
+      fprintf(stderr, "Inserting a NUMA level with a single object at depth 1\n");
     /* move existing levels by one */
-    memmove(&data->level[i+1], &data->level[i], (count*i)*sizeof(struct hwloc_synthetic_level_data_s));
-    data->level[i].type = HWLOC_OBJ_NUMANODE;
-    data->level[i].index_string = NULL;
-    data->level[i].index_array = NULL;
-    data->level[i].memorysize = 0;
-    data->level[i].totalwidth = data->level[i-1].totalwidth;
+    memmove(&data->level[2], &data->level[1], count*sizeof(struct hwloc_synthetic_level_data_s));
+    data->level[1].attr.type = HWLOC_OBJ_NUMANODE;
+    data->level[1].indexes.string = NULL;
+    data->level[1].indexes.array = NULL;
+    data->level[1].attr.memorysize = 0;
+    data->level[1].totalwidth = data->level[0].totalwidth;
     /* update arity to insert a single NUMA node per parent */
-    data->level[i].arity = data->level[i-1].arity;
-    data->level[i-1].arity = 1;
+    data->level[1].arity = data->level[0].arity;
+    data->level[0].arity = 1;
     count++;
   }
 
-  if (cache_depth == 1)
-    /* if there is a single cache level, make it L2 */
-    cache_depth = 2;
-
   for (i=0; i<count; i++) {
     struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
-    hwloc_obj_type_t type = curlevel->type;
+    hwloc_obj_type_t type = curlevel->attr.type;
 
     if (type == HWLOC_OBJ_GROUP) {
-      if (curlevel->depth == (unsigned)-1)
-	curlevel->depth = group_depth--;
-
-    } else if (type == HWLOC_OBJ_CACHE) {
-      if (curlevel->depth == (unsigned)-1)
-	curlevel->depth = cache_depth--;
-      if (curlevel->cachetype == (hwloc_obj_cache_type_t) -1)
-	curlevel->cachetype = curlevel->depth == 1 ? HWLOC_OBJ_CACHE_DATA : HWLOC_OBJ_CACHE_UNIFIED;
-      if (!curlevel->memorysize) {
-	if (1 == curlevel->depth)
+      if (curlevel->attr.depth == (unsigned)-1)
+	curlevel->attr.depth = type_count[HWLOC_OBJ_GROUP]--;
+
+    } else if (hwloc__obj_type_is_cache(type)) {
+      if (!curlevel->attr.memorysize) {
+	if (1 == curlevel->attr.depth)
 	  /* 32Kb in L1 */
-	  curlevel->memorysize = 32*1024;
+	  curlevel->attr.memorysize = 32*1024;
 	else
 	  /* *4 at each level, starting from 1MB for L2, unified */
-	  curlevel->memorysize = 256*1024 << (2*curlevel->depth);
+	  curlevel->attr.memorysize = 256ULL*1024 << (2*curlevel->attr.depth);
       }
 
-    } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->memorysize) {
+    } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->attr.memorysize) {
       /* 1GB in memory nodes. */
-      curlevel->memorysize = 1024*1024*1024;
+      curlevel->attr.memorysize = 1024*1024*1024;
     }
 
-    hwloc_synthetic_process_level_indexes(data, i, verbose);
+    hwloc_synthetic_process_indexes(data, &data->level[i].indexes, data->level[i].totalwidth, verbose);
   }
 
+  hwloc_synthetic_process_indexes(data, &data->numa_attached_indexes, data->numa_attached_nr, verbose);
+
   data->string = strdup(description);
   data->level[count-1].arity = 0;
   return 0;
 
  error:
-  for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
-    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
-    free(curlevel->index_array);
-    if (!curlevel->arity)
-      break;
-  }
+  hwloc_synthetic_free_levels(data);
   return -1;
 }
 
 static void
-hwloc_synthetic__post_look_hooks(struct hwloc_synthetic_level_data_s *curlevel,
-				 hwloc_obj_t obj)
+hwloc_synthetic_set_attr(struct hwloc_synthetic_attr_s *sattr, hwloc_obj_t obj)
 {
+  /* Fill obj's type-specific attributes (group, NUMA node, cache) from sattr; other types need none. */
   switch (obj->type) {
   case HWLOC_OBJ_GROUP:
-    obj->attr->group.depth = curlevel->depth;
-    break;
-  case HWLOC_OBJ_SYSTEM:
+    obj->attr->group.kind = HWLOC_GROUP_KIND_SYNTHETIC;
+    obj->attr->group.subkind = sattr->depth-1;
     break;
   case HWLOC_OBJ_MACHINE:
     break;
   case HWLOC_OBJ_NUMANODE:
+    obj->attr->numanode.local_memory = sattr->memorysize;
+    obj->attr->numanode.page_types = calloc(1, sizeof(*obj->attr->numanode.page_types));
+    obj->attr->numanode.page_types_len = obj->attr->numanode.page_types ? 1 : 0;
+    if (!obj->attr->numanode.page_types) break; /* out of memory: expose no page types rather than dereference NULL */
+    obj->attr->numanode.page_types[0].size = 4096;
+    obj->attr->numanode.page_types[0].count = sattr->memorysize / 4096;
     break;
   case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_DIE:
     break;
-  case HWLOC_OBJ_CACHE:
-    obj->attr->cache.depth = curlevel->depth;
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    obj->attr->cache.depth = sattr->depth;
     obj->attr->cache.linesize = 64;
-    obj->attr->cache.type = curlevel->cachetype;
-    obj->attr->cache.size = curlevel->memorysize;
+    obj->attr->cache.type = sattr->cachetype;
+    obj->attr->cache.size = sattr->memorysize;
     break;
   case HWLOC_OBJ_CORE:
     break;
   case HWLOC_OBJ_PU:
     break;
-  case HWLOC_OBJ_BRIDGE:
-  case HWLOC_OBJ_PCI_DEVICE:
-  case HWLOC_OBJ_OS_DEVICE:
-  case HWLOC_OBJ_MISC:
-  case HWLOC_OBJ_TYPE_MAX:
+  default:
     /* Should never happen */
     assert(0);
     break;
   }
-  if (curlevel->memorysize && HWLOC_OBJ_CACHE != obj->type) {
-    obj->memory.local_memory = curlevel->memorysize;
-    obj->memory.page_types_len = 1;
-    obj->memory.page_types = malloc(sizeof(*obj->memory.page_types));
-    memset(obj->memory.page_types, 0, sizeof(*obj->memory.page_types));
-    obj->memory.page_types[0].size = 4096;
-    obj->memory.page_types[0].count = curlevel->memorysize / 4096;
-  }
+}
+
+/* Consume the next position of 'indexes' and map it to the os_index of the object being created. */
+static unsigned
+hwloc_synthetic_next_index(struct hwloc_synthetic_indexes_s *indexes, hwloc_obj_type_t type)
+{
+  const unsigned position = indexes->next;
+
+  indexes->next = position + 1;
+  if (indexes->array) /* an explicit index array always decides */
+    return indexes->array[position];
+  if (hwloc__obj_type_is_cache(type) || type == HWLOC_OBJ_GROUP)
+    return HWLOC_UNKNOWN_INDEX; /* don't enforce useless os_indexes for Caches and Groups */
+  return position;
+}
+
+/* For each entry of the attached list, in order, insert one NUMA node object whose cpuset is 'set'. */
+static void
+hwloc_synthetic_insert_attached(struct hwloc_topology *topology,
+				struct hwloc_synthetic_backend_data_s *data,
+				struct hwloc_synthetic_attached_s *attached,
+				hwloc_bitmap_t set)
+{
+  struct hwloc_synthetic_attached_s *entry;
+
+  /* walk the list front to back; an empty list inserts nothing */
+  for(entry = attached; entry; entry = entry->next) {
+    unsigned numa_os_index;
+    hwloc_obj_t node;
+
+    assert(entry->attr.type == HWLOC_OBJ_NUMANODE);
+
+    /* all attached NUMA nodes draw from the single data->numa_attached_indexes sequence */
+    numa_os_index = hwloc_synthetic_next_index(&data->numa_attached_indexes, HWLOC_OBJ_NUMANODE);
+    node = hwloc_alloc_setup_object(topology, entry->attr.type, numa_os_index);
+
+    node->cpuset = hwloc_bitmap_dup(set);
+    node->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(node->nodeset, numa_os_index);
+
+    hwloc_synthetic_set_attr(&entry->attr, node);
+    hwloc_insert_object_by_cpuset(topology, node);
+  }
 }
 
 /*
@@ -702,112 +934,102 @@ hwloc__look_synthetic(struct hwloc_topology *topology,
   hwloc_obj_t obj;
   unsigned i;
   struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
-  hwloc_obj_type_t type = curlevel->type;
+  hwloc_obj_type_t type = curlevel->attr.type;
+  hwloc_bitmap_t set;
   unsigned os_index;
 
-  /* pre-hooks */
-  switch (type) {
-    case HWLOC_OBJ_GROUP:
-      break;
-    case HWLOC_OBJ_MACHINE:
-      break;
-    case HWLOC_OBJ_NUMANODE:
-      break;
-    case HWLOC_OBJ_PACKAGE:
-      break;
-    case HWLOC_OBJ_CACHE:
-      break;
-    case HWLOC_OBJ_CORE:
-      break;
-    case HWLOC_OBJ_PU:
-      break;
-    case HWLOC_OBJ_SYSTEM:
-    case HWLOC_OBJ_BRIDGE:
-    case HWLOC_OBJ_PCI_DEVICE:
-    case HWLOC_OBJ_OS_DEVICE:
-    case HWLOC_OBJ_MISC:
-    case HWLOC_OBJ_TYPE_MAX:
-      /* Should never happen */
-      assert(0);
-      break;
-  }
+  assert(hwloc__obj_type_is_normal(type) || type == HWLOC_OBJ_NUMANODE);
+  assert(type != HWLOC_OBJ_MACHINE);
 
-  os_index = curlevel->next_os_index++;
-  if (curlevel->index_array)
-    os_index = curlevel->index_array[os_index];
-  obj = hwloc_alloc_setup_object(type, os_index);
-  obj->cpuset = hwloc_bitmap_alloc();
+  os_index = hwloc_synthetic_next_index(&curlevel->indexes, type);
 
+  set = hwloc_bitmap_alloc();
   if (!curlevel->arity) {
-    hwloc_bitmap_set(obj->cpuset, os_index);
+    hwloc_bitmap_set(set, os_index);
   } else {
     for (i = 0; i < curlevel->arity; i++)
-      hwloc__look_synthetic(topology, data, level + 1, obj->cpuset);
+      hwloc__look_synthetic(topology, data, level + 1, set);
   }
 
-  if (type == HWLOC_OBJ_NUMANODE) {
-    obj->nodeset = hwloc_bitmap_alloc();
-    hwloc_bitmap_set(obj->nodeset, os_index);
-  }
+  hwloc_bitmap_or(parent_cpuset, parent_cpuset, set);
 
-  hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);
+  if (hwloc_filter_check_keep_object_type(topology, type)) {
+    obj = hwloc_alloc_setup_object(topology, type, os_index);
+    obj->cpuset = hwloc_bitmap_dup(set);
 
-  hwloc_synthetic__post_look_hooks(curlevel, obj);
+    if (type == HWLOC_OBJ_NUMANODE) {
+      obj->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(obj->nodeset, os_index);
+    }
 
-  hwloc_insert_object_by_cpuset(topology, obj);
+    hwloc_synthetic_set_attr(&curlevel->attr, obj);
+
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+
+  hwloc_synthetic_insert_attached(topology, data, curlevel->attached, set);
+
+  hwloc_bitmap_free(set);
 }
 
 static int
-hwloc_look_synthetic(struct hwloc_backend *backend)
+hwloc_look_synthetic(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
 {
+  /* Populate the topology below the root from the parsed synthetic levels and attached NUMA nodes.
+   * This backend enforces !topology->is_thissystem by default.
+   */
+
   struct hwloc_topology *topology = backend->topology;
   struct hwloc_synthetic_backend_data_s *data = backend->private_data;
   hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
   unsigned i;
 
+  assert(dstatus->phase == HWLOC_DISC_PHASE_GLOBAL);
+
   assert(!topology->levels[0][0]->cpuset);
 
-  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+  hwloc_alloc_root_sets(topology->levels[0][0]);
 
   topology->support.discovery->pu = 1;
+  topology->support.discovery->numa = 1; /* we add a single NUMA node if none is given */
+  topology->support.discovery->numa_memory = 1; /* specified or default size */
 
   /* start with os_index 0 for each level */
   for (i = 0; data->level[i].arity > 0; i++)
-    data->level[i].next_os_index = 0;
+    data->level[i].indexes.next = 0;
+  data->numa_attached_indexes.next = 0; /* attached NUMA nodes use their own index sequence */
   /* ... including the last one */
-  data->level[i].next_os_index = 0;
+  data->level[i].indexes.next = 0;
 
   /* update first level type according to the synthetic type array */
-  topology->levels[0][0]->type = data->level[0].type;
-  hwloc_synthetic__post_look_hooks(&data->level[0], topology->levels[0][0]);
+  topology->levels[0][0]->type = data->level[0].attr.type;
+  hwloc_synthetic_set_attr(&data->level[0].attr, topology->levels[0][0]);
 
   for (i = 0; i < data->level[0].arity; i++)
     hwloc__look_synthetic(topology, data, 1, cpuset);
 
+  hwloc_synthetic_insert_attached(topology, data, data->level[0].attached, cpuset);
+
   hwloc_bitmap_free(cpuset);
 
   hwloc_obj_add_info(topology->levels[0][0], "Backend", "Synthetic");
   hwloc_obj_add_info(topology->levels[0][0], "SyntheticDescription", data->string);
-  return 1;
+  return 0;
 }
 
 static void
 hwloc_synthetic_backend_disable(struct hwloc_backend *backend)
 {
   struct hwloc_synthetic_backend_data_s *data = backend->private_data;
-  unsigned i;
-  for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
-    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
-    free(curlevel->index_array);
-    if (!curlevel->arity)
-      break;
-  }
+  hwloc_synthetic_free_levels(data); /* per-level attached lists and index arrays, plus the attached-NUMA index array */
   free(data->string);
   free(data);
 }
 
 static struct hwloc_backend *
-hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
+hwloc_synthetic_component_instantiate(struct hwloc_topology *topology,
+				      struct hwloc_disc_component *component,
+				      unsigned excluded_phases __hwloc_attribute_unused,
 				      const void *_data1,
 				      const void *_data2 __hwloc_attribute_unused,
 				      const void *_data3 __hwloc_attribute_unused)
@@ -827,7 +1049,7 @@ hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
     }
   }
 
-  backend = hwloc_backend_alloc(component);
+  backend = hwloc_backend_alloc(topology, component);
   if (!backend)
     goto out;
 
@@ -857,11 +1079,12 @@ hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
 }
 
 static struct hwloc_disc_component hwloc_synthetic_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
   "synthetic",
+  HWLOC_DISC_PHASE_GLOBAL, /* same phase that hwloc_look_synthetic() asserts on entry */
   ~0,
   hwloc_synthetic_component_instantiate,
   30,
+  1, /* NOTE(review): presumably the enabled-by-default flag; confirm against struct hwloc_disc_component */
   NULL
 };
 
@@ -873,15 +1096,38 @@ const struct hwloc_component hwloc_synthetic_component = {
   &hwloc_synthetic_disc_component
 };
 
-static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topology,
-						   hwloc_obj_t obj,
-						   char *buffer, size_t buflen)
+/* Add an snprintf()-style result to *ret and advance the (*tmp, *tmplen) cursor; returns -1 if res < 0. */
+static __hwloc_inline int
+hwloc__export_synthetic_update_status(int *ret, char **tmp, ssize_t *tmplen, int res)
+{
+  if (res < 0)
+    return -1;
+  *ret += res;
+  res = (res < *tmplen) ? res : (*tmplen > 0 ? (int)(*tmplen) - 1 : 0); /* truncated: step to the last byte, if any */
+  *tmp += res;
+  *tmplen -= res;
+  return 0;
+}
+
+/* Append c plus a terminating NUL when at least two bytes remain; the character is always counted in *ret. */
+static __hwloc_inline void
+hwloc__export_synthetic_add_char(int *ret, char **tmp, ssize_t *tmplen, char c)
+{
+  *ret += 1;
+  if (*tmplen <= 1)
+    return;
+  *(*tmp)++ = c;
+  **tmp = '\0';
+  *tmplen -= 1;
+}
+
+static int
+hwloc__export_synthetic_indexes(hwloc_obj_t *level, unsigned total,
+				char *buffer, size_t buflen)
 {
-  unsigned depth = obj->depth;
-  unsigned total = topology->level_nbobjects[depth];
   unsigned step = 1;
   unsigned nr_loops = 0;
-  struct hwloc_synthetic_intlv_loop_s *loops = NULL;
+  struct hwloc_synthetic_intlv_loop_s *loops = NULL, *tmploops;
   hwloc_obj_t cur;
   unsigned i, j;
   ssize_t tmplen = buflen;
@@ -889,7 +1135,7 @@ static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topol
   int res, ret = 0;
 
   /* must start with 0 */
-  if (obj->os_index)
+  if (level[0]->os_index)
     goto exportall;
 
   while (step != total) {
@@ -899,18 +1145,19 @@ static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topol
 
     /* look for os_index == step */
     for(i=1; i<total; i++)
-      if (topology->levels[depth][i]->os_index == step)
+      if (level[i]->os_index == step)
 	break;
     if (i == total)
       goto exportall;
     for(j=2; j<total/i; j++)
-      if (topology->levels[depth][i*j]->os_index != step*j)
+      if (level[i*j]->os_index != step*j)
 	break;
 
     nr_loops++;
-    loops = realloc(loops, nr_loops*sizeof(*loops));
-    if (!loops)
+    tmploops = realloc(loops, nr_loops*sizeof(*loops));
+    if (!tmploops)
       goto exportall;
+    loops = tmploops;
     loops[nr_loops-1].step = i;
     loops[nr_loops-1].nb = j;
     step *= j;
@@ -924,7 +1171,7 @@ static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topol
       ind += (i / loops[j].step) % loops[j].nb * mul;
       mul *= loops[j].nb;
     }
-    if (topology->levels[depth][i]->os_index != ind)
+    if (level[i]->os_index != ind)
       goto exportall;
   }
 
@@ -932,46 +1179,34 @@ static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topol
   for(j=0; j<nr_loops; j++) {
     res = hwloc_snprintf(tmp, tmplen, "%u*%u%s", loops[j].step, loops[j].nb,
 			 j == nr_loops-1 ? ")" : ":");
-    if (res < 0) {
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0) {
       free(loops);
       return -1;
     }
-    ret += res;
-    if (res >= tmplen)
-      res = tmplen>0 ? tmplen - 1 : 0;
-    tmp += res;
-    tmplen -= res;
   }
 
-  if (loops)
-    free(loops);
-
+  free(loops);
   return ret;
 
  exportall:
-  if (loops)
-    free(loops);
+  free(loops);
 
   /* dump all indexes */
-  cur = obj;
+  cur = level[0];
   while (cur) {
-    res = snprintf(tmp, tmplen, "%u%s", cur->os_index,
-		   cur->next_cousin ? "," : ")");
-    if (res < 0)
+    res = hwloc_snprintf(tmp, tmplen, "%u%s", cur->os_index,
+			 cur->next_cousin ? "," : ")");
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
       return -1;
-    ret += res;
-    if (res >= tmplen)
-      res = tmplen>0 ? tmplen - 1 : 0;
-    tmp += res;
-    tmplen -= res;
     cur = cur->next_cousin;
   }
   return ret;
 }
 
-static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topology,
-						    hwloc_obj_t obj,
-						    char *buffer, size_t buflen)
+static int
+hwloc__export_synthetic_obj_attr(struct hwloc_topology * topology,
+				 hwloc_obj_t obj,
+				 char *buffer, size_t buflen)
 {
   const char * separator = " ";
   const char * prefix = "(";
@@ -979,17 +1214,18 @@ static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topo
   char memsize[64] = "";
   int needindexes = 0;
 
-  if (HWLOC_OBJ_CACHE == obj->type && obj->attr->cache.size) {
+  if (hwloc__obj_type_is_cache(obj->type) && obj->attr->cache.size) {
     snprintf(cachesize, sizeof(cachesize), "%ssize=%llu",
 	     prefix, (unsigned long long) obj->attr->cache.size);
     prefix = separator;
   }
-  if (obj->memory.local_memory) {
+  if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory) {
     snprintf(memsize, sizeof(memsize), "%smemory=%llu",
-	     prefix, (unsigned long long) obj->memory.local_memory);
+	     prefix, (unsigned long long) obj->attr->numanode.local_memory);
     prefix = separator;
   }
-  if (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE) {
+  if (!obj->logical_index /* only display indexes once per level (not for non-first NUMA children, etc.) */
+      && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)) {
     hwloc_obj_t cur = obj;
     while (cur) {
       if (cur->os_index != cur->logical_index) {
@@ -1005,32 +1241,29 @@ static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topo
     int res, ret = 0;
 
     res = hwloc_snprintf(tmp, tmplen, "%s%s%s", cachesize, memsize, needindexes ? "" : ")");
-    if (res < 0)
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
       return -1;
-    ret += res;
-    if (res >= tmplen)
-      res = tmplen>0 ? tmplen - 1 : 0;
-    tmp += res;
-    tmplen -= res;
 
     if (needindexes) {
-      res = snprintf(tmp, tmplen, "%sindexes=", prefix);
-      if (res < 0)
+      unsigned total;
+      hwloc_obj_t *level;
+
+      if (obj->depth < 0) {
+	assert(obj->depth == HWLOC_TYPE_DEPTH_NUMANODE);
+	total = topology->slevels[HWLOC_SLEVEL_NUMANODE].nbobjs;
+	level = topology->slevels[HWLOC_SLEVEL_NUMANODE].objs;
+      } else {
+	total = topology->level_nbobjects[obj->depth];
+	level = topology->levels[obj->depth];
+      }
+
+      res = hwloc_snprintf(tmp, tmplen, "%sindexes=", prefix);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
 	return -1;
-      ret += res;
-      if (res >= tmplen)
-	res = tmplen>0 ? tmplen - 1 : 0;
-      tmp += res;
-      tmplen -= res;
-
-      res = hwloc_topology_export_synthetic_indexes(topology, obj, tmp, tmplen);
-      if (res < 0)
+
+      res = hwloc__export_synthetic_indexes(level, total, tmp, tmplen);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
 	return -1;
-      ret += res;
-      if (res >= tmplen)
-	res = tmplen>0 ? tmplen - 1 : 0;
-      tmp += res;
-      tmplen -= res;
     }
     return ret;
   } else {
@@ -1038,6 +1271,174 @@ static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topo
   }
 }
 
+/* Export a single object as "<type>[:<arity>]" followed by its attributes (unless NO_ATTRS).
+ * Pass (unsigned)-1 as arity to omit the ":<arity>" part.
+ * Returns the accumulated snprintf-like length, or -1 on error.
+ */
+static int
+hwloc__export_synthetic_obj(struct hwloc_topology * topology, unsigned long flags,
+			    hwloc_obj_t obj, unsigned arity,
+			    char *buffer, size_t buflen)
+{
+  const int noext = (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) != 0;
+  const int v1compat = noext || (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) != 0;
+  char suffix[12] = "";
+  char *cursor = buffer;
+  ssize_t remaining = buflen;
+  int total = 0;
+  int len;
+
+  /* <type>:<arity>, except for root */
+  if (arity != (unsigned)-1)
+    snprintf(suffix, sizeof(suffix), ":%u", arity);
+
+  if (noext && hwloc__obj_type_is_cache(obj->type)) {
+    /* v1 uses generic "Cache" for non-extended type name */
+    len = hwloc_snprintf(cursor, remaining, "Cache%s", suffix);
+  } else if (v1compat && obj->type == HWLOC_OBJ_PACKAGE) {
+    /* exporting to v1 or without extended-types: use the all-v1-compatible Socket name */
+    len = hwloc_snprintf(cursor, remaining, "Socket%s", suffix);
+  } else if (v1compat && obj->type == HWLOC_OBJ_DIE) {
+    /* exporting to v1 or without extended-types: use the all-v1-compatible Group name */
+    len = hwloc_snprintf(cursor, remaining, "Group%s", suffix);
+  } else if (noext || obj->type == HWLOC_OBJ_GROUP) {
+    /* plain type name (the group depth is not exported) */
+    len = hwloc_snprintf(cursor, remaining, "%s%s", hwloc_obj_type_string(obj->type), suffix);
+  } else {
+    char extname[64];
+    hwloc_obj_type_snprintf(extname, sizeof(extname), obj, 1);
+    len = hwloc_snprintf(cursor, remaining, "%s%s", extname, suffix);
+  }
+  if (hwloc__export_synthetic_update_status(&total, &cursor, &remaining, len) < 0)
+    return -1;
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+    return total;
+
+  /* obj attributes */
+  len = hwloc__export_synthetic_obj_attr(topology, obj, cursor, remaining);
+  if (hwloc__export_synthetic_update_status(&total, &cursor, &remaining, len) < 0)
+    return -1;
+  return total;
+}
+
+/* Export the memory children of parent, preceded by a space when needprefix is set: a single NUMA
+ * child with arity 1 in v1 mode, otherwise one bracketed NUMA node per memory child. Returns the
+ * accumulated length, 0 if parent has no memory child, or -1 on error (EINVAL if v1 cannot express it).
+ */
+static int
+hwloc__export_synthetic_memory_children(struct hwloc_topology * topology, unsigned long flags,
+					hwloc_obj_t parent,
+					char *buffer, size_t buflen,
+					int needprefix, int verbose)
+{
+  hwloc_obj_t mchild = parent->memory_first_child;
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  if (!mchild)
+    return 0;
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) {
+    /* v1: export a single NUMA child, anything else is not supported */
+    if (parent->memory_arity > 1 || mchild->type != HWLOC_OBJ_NUMANODE) {
+      if (verbose)
+	fprintf(stderr, "Cannot export to synthetic v1 if multiple memory children are attached to the same location.\n");
+      errno = EINVAL;
+      return -1;
+    }
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+
+    res = hwloc__export_synthetic_obj(topology, flags, mchild, 1, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+    return ret;
+  }
+
+  while (mchild) {
+    /* FIXME: really recurse to export memcaches and numanode,
+     * but it requires clever parsing of [ memcache [numa] [numa] ] during import,
+     * better attaching of things to describe the hierarchy.
+     */
+    hwloc_obj_t numanode = mchild;
+    /* only export the first NUMA node leaf of each memory child
+     * FIXME: This assumes mscache aren't shared between nodes, that's true in current platforms
+     */
+    while (numanode && numanode->type != HWLOC_OBJ_NUMANODE) {
+      /* the descent follows memory_first_child, whose children are counted in memory_arity, not arity */
+      assert(numanode->memory_arity == 1);
+      numanode = numanode->memory_first_child;
+    }
+    assert(numanode); /* there's always a numanode at the bottom of the memory tree */
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+    hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, '[');
+
+    res = hwloc__export_synthetic_obj(topology, flags, numanode, (unsigned)-1, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+    hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ']');
+
+    needprefix = 1;
+    mchild = mchild->next_sibling;
+  }
+  return ret;
+}
+
+/* Tell whether memory is attached symmetrically: for each NUMA node, every object at the
+ * depth of its parent must have the same memory arity as that parent.
+ * Returns 0 if so, -1 otherwise (also when the temporary bitmap cannot be allocated).
+ */
+static int
+hwloc_check_memory_symmetric(struct hwloc_topology * topology)
+{
+  hwloc_bitmap_t todo = hwloc_bitmap_dup(hwloc_get_root_obj(topology)->nodeset);
+  int status = -1;
+
+  if (!todo)
+    /* assume asymmetric */
+    return -1;
+
+  for (;;) {
+    hwloc_obj_t numa, ref;
+    unsigned idx, pos;
+
+    if (hwloc_bitmap_iszero(todo)) {
+      /* no NUMA node left to check */
+      status = 0;
+      break;
+    }
+
+    idx = hwloc_bitmap_first(todo);
+    numa = hwloc_get_numanode_obj_by_os_index(topology, idx);
+    assert(numa);
+    ref = numa->parent;
+
+    /* every object on the parent's level must have the same number of memory children */
+    for (pos = 0; pos < hwloc_get_nbobjs_by_depth(topology, ref->depth); pos++) {
+      hwloc_obj_t peer = hwloc_get_obj_by_depth(topology, ref->depth, pos);
+      hwloc_obj_t mem;
+
+      assert(peer);
+      if (peer->memory_arity != ref->memory_arity)
+	goto out;
+
+      /* clear the bits of these memory children from the remaining set
+       * (peer->nodeset cannot be used, some normal children may have other NUMA nodes) */
+      for (mem = peer->memory_first_child; mem; mem = mem->next_sibling)
+	hwloc_bitmap_clr(todo, mem->os_index);
+    }
+  }
+
+ out:
+  hwloc_bitmap_free(todo);
+  return status;
+}
+
 int
 hwloc_topology_export_synthetic(struct hwloc_topology * topology,
 				char *buffer, size_t buflen,
@@ -1047,11 +1448,23 @@ hwloc_topology_export_synthetic(struct hwloc_topology * topology,
   ssize_t tmplen = buflen;
   char *tmp = buffer;
   int res, ret = 0;
-   int arity;
-  const char * separator = " ";
-  const char * prefix = "";
+  unsigned arity;
+  int needprefix = 0;
+  int verbose = 0;
+  const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
 
-  if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+  if (env)
+    verbose = atoi(env);
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1
+		|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
     errno = EINVAL;
     return -1;
   }
@@ -1070,57 +1483,79 @@ hwloc_topology_export_synthetic(struct hwloc_topology * topology,
   /* TODO: flag to force all indexes, not only for PU and NUMA? */
 
   if (!obj->symmetric_subtree) {
+    if (verbose)
+      fprintf(stderr, "Cannot export to synthetic unless topology is symmetric (root->symmetric_subtree must be set).\n");
     errno = EINVAL;
     return -1;
   }
 
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)
+      && hwloc_check_memory_symmetric(topology) < 0) {
+    if (verbose)
+      fprintf(stderr, "Cannot export to synthetic unless memory is attached symmetrically.\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1) {
+    /* v1 requires all NUMA at the same level */
+    hwloc_obj_t node;
+    signed pdepth;
+
+    node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
+    assert(hwloc__obj_type_is_normal(node->parent->type)); /* only depth-1 memory children for now */
+    pdepth = node->parent->depth;
+
+    while ((node = node->next_cousin) != NULL) {
+      assert(hwloc__obj_type_is_normal(node->parent->type)); /* only depth-1 memory children for now */
+      if (node->parent->depth != pdepth) {
+	if (verbose)
+	  fprintf(stderr, "Cannot export to synthetic v1 if memory is attached to parents at different depths.\n");
+	errno = EINVAL;
+	return -1;
+      }
+    }
+  }
+
+  /* we're good, start exporting */
+
   if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
-    /* root attributes */
-    res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
-    if (res < 0)
+    /* obj attributes */
+    res = hwloc__export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+    if (res > 0)
+      needprefix = 1;
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
+      return -1;
+  }
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
+    res = hwloc__export_synthetic_memory_children(topology, flags, obj, tmp, tmplen, needprefix, verbose);
+    if (res > 0)
+      needprefix = 1;
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
       return -1;
-    ret += res;
-    if (ret > 0)
-      prefix = separator;
-    if (res >= tmplen)
-      res = tmplen>0 ? tmplen - 1 : 0;
-    tmp += res;
-    tmplen -= res;
   }
 
   arity = obj->arity;
   while (arity) {
     /* for each level */
     obj = obj->first_child;
-    if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) {
-      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, hwloc_obj_type_string(obj->type), arity);
-    } else {
-      char types[64];
-      hwloc_obj_type_snprintf(types, sizeof(types), obj, 1);
-      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, types, arity);
-    }
-    if (res < 0)
+
+    if (needprefix)
+      hwloc__export_synthetic_add_char(&ret, &tmp, &tmplen, ' ');
+
+    res = hwloc__export_synthetic_obj(topology, flags, obj, arity, tmp, tmplen);
+    if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
       return -1;
-    ret += res;
-    if (res >= tmplen)
-      res = tmplen>0 ? tmplen - 1 : 0;
-    tmp += res;
-    tmplen -= res;
-
-    if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
-      /* obj attributes */
-      res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
-      if (res < 0)
+
+    if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)) {
+      res = hwloc__export_synthetic_memory_children(topology, flags, obj, tmp, tmplen, 1, verbose);
+      if (hwloc__export_synthetic_update_status(&ret, &tmp, &tmplen, res) < 0)
 	return -1;
-      ret += res;
-      if (res >= tmplen)
-	res = tmplen>0 ? tmplen - 1 : 0;
-      tmp += res;
-      tmplen -= res;
     }
 
     /* next level */
-    prefix = separator;
+    needprefix = 1;
     arity = obj->arity;
   }
 
diff --git a/ext/hwloc/hwloc/topology-x86.c b/ext/hwloc/hwloc/topology-x86.c
index 071b01744..1060157de 100644
--- a/ext/hwloc/hwloc/topology-x86.c
+++ b/ext/hwloc/hwloc/topology-x86.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2019 Inria.  All rights reserved.
  * Copyright © 2010-2013 Université Bordeaux
  * Copyright © 2010-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -14,22 +14,27 @@
  * on various architectures, without having to use this x86-specific code.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-#include <private/misc.h>
-
-#include <private/cpuid-x86.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/debug.h"
+#include "private/misc.h"
+#include "private/cpuid-x86.h"
 
 #include <sys/types.h>
+#ifdef HAVE_DIRENT_H
 #include <dirent.h>
+#endif
+#ifdef HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
 
 struct hwloc_x86_backend_data_s {
   unsigned nbprocs;
   hwloc_bitmap_t apicid_set;
   int apicid_unique;
   char *src_cpuiddump_path;
+  int is_knl;
 };
 
 /************************************
@@ -64,29 +69,37 @@ cpuiddump_read(const char *dirpath, unsigned idx)
 {
   struct cpuiddump *cpuiddump;
   struct cpuiddump_entry *cur;
+  size_t filenamelen;
   char *filename;
-  size_t filenamelen = strlen(dirpath) + 15;
   FILE *file;
   char line[128];
   unsigned nr;
 
   cpuiddump = malloc(sizeof(*cpuiddump));
-  cpuiddump->nr = 0; /* return a cpuiddump that will raise errors because it matches nothing */
+  if (!cpuiddump) {
+    fprintf(stderr, "Failed to allocate cpuiddump for PU #%u, ignoring cpuiddump.\n", idx);
+    goto out;
+  }
 
+  filenamelen = strlen(dirpath) + 15;
   filename = malloc(filenamelen);
+  if (!filename)
+    goto out_with_dump;
   snprintf(filename, filenamelen, "%s/pu%u", dirpath, idx);
   file = fopen(filename, "r");
   if (!file) {
-    fprintf(stderr, "Could not read dumped cpuid file %s\n", filename);
-    free(filename);
-    return cpuiddump;
+    fprintf(stderr, "Could not read dumped cpuid file %s, ignoring cpuiddump.\n", filename);
+    goto out_with_filename;
   }
-  free(filename);
 
   nr = 0;
   while (fgets(line, sizeof(line), file))
     nr++;
   cpuiddump->entries = malloc(nr * sizeof(struct cpuiddump_entry));
+  if (!cpuiddump->entries) {
+    fprintf(stderr, "Failed to allocate %u cpuiddump entries for PU #%u, ignoring cpuiddump.\n", nr, idx);
+    goto out_with_file;
+  }
 
   fseek(file, 0, SEEK_SET);
   cur = &cpuiddump->entries[0];
@@ -102,9 +115,20 @@ cpuiddump_read(const char *dirpath, unsigned idx)
       nr++;
     }
   }
+
   cpuiddump->nr = nr;
   fclose(file);
+  free(filename);
   return cpuiddump;
+
+ out_with_file:
+  fclose(file);
+ out_with_filename:
+  free(filename);
+ out_with_dump:
+  free(cpuiddump);
+ out:
+  return NULL;
 }
 
 static void
@@ -150,35 +174,40 @@ static void cpuid_or_from_dump(unsigned *eax, unsigned *ebx, unsigned *ecx, unsi
  * Core detection routines and structures
  */
 
+enum hwloc_x86_disc_flags { /* each flag below is a distinct bit */
+  HWLOC_X86_DISC_FLAG_FULL = (1<<0), /* discover everything instead of only annotating */
+  HWLOC_X86_DISC_FLAG_TOPOEXT_NUMANODES = (1<<1) /* use AMD topoext numanode information */
+};
+
 #define has_topoext(features) ((features)[6] & (1 << 22))
 #define has_x2apic(features) ((features)[4] & (1 << 21))
 
 struct cacheinfo {
-  unsigned type;
+  hwloc_obj_cache_type_t type;
   unsigned level;
   unsigned nbthreads_sharing;
+  unsigned cacheid;
 
   unsigned linesize;
   unsigned linepart;
+  int inclusive;
   int ways;
   unsigned sets;
   unsigned long size;
-  char inclusiveness;
-
 };
 
 struct procinfo {
   unsigned present;
   unsigned apicid;
-  unsigned max_log_proc;
-  unsigned max_nbcores;
-  unsigned max_nbthreads;
-  unsigned packageid;
-  unsigned nodeid;
-  unsigned unitid;
-  unsigned logprocid;
-  unsigned threadid;
-  unsigned coreid;
+#define PKG 0
+#define CORE 1
+#define NODE 2
+#define UNIT 3
+#define TILE 4
+#define MODULE 5
+#define DIE 6
+#define HWLOC_X86_PROCINFO_ID_NR 7
+  unsigned ids[HWLOC_X86_PROCINFO_ID_NR];
   unsigned *otherids;
   unsigned levels;
   unsigned numcaches;
@@ -193,12 +222,15 @@ struct procinfo {
 enum cpuid_type {
   intel,
   amd,
+  zhaoxin,
+  hygon,
   unknown
 };
 
-static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, unsigned cpuid)
+/* AMD legacy cache information from specific CPUID 0x80000005-6 leaves */
+static void setup__amd_cache_legacy(struct procinfo *infos, unsigned level, hwloc_obj_cache_type_t type, unsigned nbthreads_sharing, unsigned cpuid)
 {
-  struct cacheinfo *cache;
+  struct cacheinfo *cache, *tmpcaches;
   unsigned cachenum;
   unsigned long size = 0;
 
@@ -211,28 +243,28 @@ static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, uns
   if (!size)
     return;
 
+  tmpcaches = realloc(infos->cache, (infos->numcaches+1)*sizeof(*infos->cache));
+  if (!tmpcaches)
+    /* failed to allocate, ignore that cache */
+    return;
+  infos->cache = tmpcaches;
   cachenum = infos->numcaches++;
-  infos->cache = realloc(infos->cache, infos->numcaches*sizeof(*infos->cache));
+
   cache = &infos->cache[cachenum];
 
   cache->type = type;
   cache->level = level;
-  if (level <= 2)
-    cache->nbthreads_sharing = 1;
-  else
-    cache->nbthreads_sharing = infos->max_log_proc;
+  cache->nbthreads_sharing = nbthreads_sharing;
   cache->linesize = cpuid & 0xff;
   cache->linepart = 0;
-  if (level == 1) {
-    cache->inclusiveness = 0;//get inclusiveness old AMD ( suposed to be L1 false)
+  cache->inclusive = 0; /* old AMD (K8-K10) supposed to have exclusive caches */
 
+  if (level == 1) {
     cache->ways = (cpuid >> 16) & 0xff;
     if (cache->ways == 0xff)
       /* Fully associative */
       cache->ways = -1;
   } else {
-    cache->inclusiveness = 1;//get inclusivenessold AMD ( suposed to be L2 L3 true)
-
     static const unsigned ways_tab[] = { 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, -1 };
     unsigned ways = (cpuid >> 12) & 0xf;
     cache->ways = ways_tab[ways];
@@ -240,162 +272,66 @@ static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, uns
   cache->size = size;
   cache->sets = 0;
 
-  hwloc_debug("cache L%u t%u linesize %u ways %u size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
+  hwloc_debug("cache L%u t%u linesize %u ways %d size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
 }
 
-/* Fetch information from the processor itself thanks to cpuid and store it in
- * infos for summarize to analyze them globally */
-static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+/* AMD legacy cache information from CPUID 0x80000005-6 leaves */
+static void read_amd_caches_legacy(struct procinfo *infos, struct cpuiddump *src_cpuiddump, unsigned legacy_max_log_proc)
 {
-  struct hwloc_x86_backend_data_s *data = backend->private_data;
-  unsigned eax, ebx, ecx = 0, edx;
-  unsigned cachenum;
-  struct cacheinfo *cache;
-  unsigned regs[4];
-  unsigned _model, _extendedmodel, _family, _extendedfamily;
+  unsigned eax, ebx, ecx, edx;
 
-  infos->present = 1;
-
-  /* on return from this function, the following fields must be set in infos:
-   * packageid, nodeid, unitid, coreid, threadid, or -1
-   * apicid
-   * levels and levels slots in otherids[]
-   * numcaches and numcaches slots in caches[]
-   *
-   * max_log_proc, max_nbthreads, max_nbcores, logprocid
-   * are only used temporarily inside this function and its callees.
-   */
-
-  /* Get apicid, max_log_proc, packageid, logprocid from cpuid 0x01 */
-  eax = 0x01;
+  eax = 0x80000005;
   cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-  infos->apicid = ebx >> 24;
-  if (edx & (1 << 28))
-    infos->max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
-  else
-    infos->max_log_proc = 1;
-  hwloc_debug("APIC ID 0x%02x max_log_proc %u\n", infos->apicid, infos->max_log_proc);
-  infos->packageid = infos->apicid / infos->max_log_proc;
-  infos->logprocid = infos->apicid % infos->max_log_proc;
-  hwloc_debug("phys %u thread %u\n", infos->packageid, infos->logprocid);
-
-  /* Get cpu model/family/stepping numbers from same cpuid */
-  _model          = (eax>>4) & 0xf;
-  _extendedmodel  = (eax>>16) & 0xf;
-  _family         = (eax>>8) & 0xf;
-  _extendedfamily = (eax>>20) & 0xff;
-  if ((cpuid_type == intel || cpuid_type == amd) && _family == 0xf) {
-    infos->cpufamilynumber = _family + _extendedfamily;
-  } else {
-    infos->cpufamilynumber = _family;
-  }
-  if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
-      || (cpuid_type == amd && _family == 0xf)) {
-    infos->cpumodelnumber = _model + (_extendedmodel << 4);
-  } else {
-    infos->cpumodelnumber = _model;
-  }
-  infos->cpustepping = eax & 0xf;
-
-  /* Get cpu vendor string from cpuid 0x00 */
-  memset(regs, 0, sizeof(regs));
-  regs[0] = 0;
-  cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
-  memcpy(infos->cpuvendor, regs+1, 4*3);
-  /* infos was calloc'ed, already ends with \0 */
+  setup__amd_cache_legacy(infos, 1, HWLOC_OBJ_CACHE_DATA, 1, ecx); /* private L1d */
+  setup__amd_cache_legacy(infos, 1, HWLOC_OBJ_CACHE_INSTRUCTION, 1, edx); /* private L1i */
 
-  /* Get cpu model string from cpuid 0x80000002-4 */
-  if (highest_ext_cpuid >= 0x80000004) {
-    memset(regs, 0, sizeof(regs));
-    regs[0] = 0x80000002;
-    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
-    memcpy(infos->cpumodel, regs, 4*4);
-    regs[0] = 0x80000003;
-    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
-    memcpy(infos->cpumodel + 4*4, regs, 4*4);
-    regs[0] = 0x80000004;
-    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
-    memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
-    /* infos was calloc'ed, already ends with \0 */
-  }
-
-  /* Get core/thread information from cpuid 0x80000008
-   * (not supported on Intel)
-   */
-  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000008) {
-    unsigned coreidsize;
-    eax = 0x80000008;
-    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-    coreidsize = (ecx >> 12) & 0xf;
-    hwloc_debug("core ID size: %u\n", coreidsize);
-    if (!coreidsize) {
-      infos->max_nbcores = (ecx & 0xff) + 1;
-    } else
-      infos->max_nbcores = 1 << coreidsize;
-    hwloc_debug("Thus max # of cores: %u\n", infos->max_nbcores);
-    /* Still no multithreaded AMD */
-    infos->max_nbthreads = 1 ;
-    hwloc_debug("and max # of threads: %u\n", infos->max_nbthreads);
-    /* The legacy max_log_proc is deprecated, it can be smaller than max_nbcores,
-     * which is the maximum number of cores that the processor could theoretically support
-     * (see "Multiple Core Calculation" in the AMD CPUID specification).
-     * Recompute packageid/logprocid/threadid/coreid accordingly.
+  eax = 0x80000006;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  if (ecx & 0xf000)
+    /* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
+     * Could be useful if some Intels (at least before Core micro-architecture)
+     * support this leaf without leaf 0x4.
      */
-    infos->packageid = infos->apicid / infos->max_nbcores;
-    infos->logprocid = infos->apicid % infos->max_nbcores;
-    infos->threadid = infos->logprocid % infos->max_nbthreads;
-    infos->coreid = infos->logprocid / infos->max_nbthreads;
-    hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
-  }
+    setup__amd_cache_legacy(infos, 2, HWLOC_OBJ_CACHE_UNIFIED, 1, ecx); /* private L2u */
+  if (edx & 0xf000)
+    setup__amd_cache_legacy(infos, 3, HWLOC_OBJ_CACHE_UNIFIED, legacy_max_log_proc, edx); /* package-wide L3u */
+}
 
-  infos->numcaches = 0;
-  infos->cache = NULL;
+/* AMD caches from CPUID 0x8000001d leaf (topoext) */
+static void read_amd_caches_topoext(struct procinfo *infos, struct cpuiddump *src_cpuiddump)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned cachenum;
+  struct cacheinfo *cache;
 
-  /* Get apicid, nodeid, unitid from cpuid 0x8000001e
-   * and cache information from cpuid 0x8000001d
-   * (AMD topology extension)
-   */
-  if (cpuid_type != intel && has_topoext(features)) {
-    unsigned apic_id, node_id, nodes_per_proc, unit_id, cores_per_unit;
+  /* the code below doesn't want any other cache yet */
+  assert(!infos->numcaches);
 
-    eax = 0x8000001e;
+  for (cachenum = 0; ; cachenum++) {
+    eax = 0x8000001d;
+    ecx = cachenum;
     cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-    infos->apicid = apic_id = eax;
-    infos->nodeid = node_id = ecx & 0xff;
-    nodes_per_proc = ((ecx >> 8) & 7) + 1;
-    if (nodes_per_proc > 2) {
-      hwloc_debug("warning: undefined value %d, assuming it means %d\n", nodes_per_proc, nodes_per_proc);
-    }
-    infos->unitid = unit_id = ebx & 0xff;
-    cores_per_unit = ((ebx >> 8) & 3) + 1;
-    hwloc_debug("x2APIC %08x, %d nodes, node %d, %d cores in unit %d\n", apic_id, nodes_per_proc, node_id, cores_per_unit, unit_id);
-
-    for (cachenum = 0; ; cachenum++) {
-      unsigned type;
-      eax = 0x8000001d;
-      ecx = cachenum;
-      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-      type = eax & 0x1f;
-      if (type == 0)
-	break;
-      infos->numcaches++;
-    }
-
-    cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+    if ((eax & 0x1f) == 0)
+      break;
+    infos->numcaches++;
+  }
 
+  cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+  if (cache) {
     for (cachenum = 0; ; cachenum++) {
       unsigned long linesize, linepart, ways, sets;
-      unsigned type;
       eax = 0x8000001d;
       ecx = cachenum;
       cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
 
-      type = eax & 0x1f;
-
-      if (type == 0)
+      if ((eax & 0x1f) == 0)
 	break;
+      switch (eax & 0x1f) {
+      case 1: cache->type = HWLOC_OBJ_CACHE_DATA; break;
+      case 2: cache->type = HWLOC_OBJ_CACHE_INSTRUCTION; break;
+      default: cache->type = HWLOC_OBJ_CACHE_UNIFIED; break;
+      }
 
-      cache->type = type;
       cache->level = (eax >> 5) & 0x7;
       /* Note: actually number of cores */
       cache->nbthreads_sharing = ((eax >> 14) &  0xfff) + 1;
@@ -411,86 +347,71 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
 	cache->ways = ways;
       cache->sets = sets = ecx + 1;
       cache->size = linesize * linepart * ways * sets;
-      cache->inclusiveness = edx & 0x2;
-
+      cache->inclusive = edx & 0x2;
 
-      hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %uKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+      hwloc_debug("cache %u L%u%c t%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n",
+		  cachenum, cache->level,
+		  cache->type == HWLOC_OBJ_CACHE_DATA ? 'd' : cache->type == HWLOC_OBJ_CACHE_INSTRUCTION ? 'i' : 'u',
+		  cache->nbthreads_sharing, linesize, linepart, ways, sets, cache->size >> 10);
 
       cache++;
     }
   } else {
-    /* If there's no topoext,
-     * get cache information from cpuid 0x80000005 and 0x80000006
-     * (not supported on Intel)
-     */
-    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000005) {
-      eax = 0x80000005;
-      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-      fill_amd_cache(infos, 1, 1, ecx); /* L1d */
-      fill_amd_cache(infos, 1, 2, edx); /* L1i */
-    }
-    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000006) {
-      eax = 0x80000006;
-      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-      if (ecx & 0xf000)
-	/* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
-	 * Could be useful if some Intels (at least before Core micro-architecture)
-	 * support this leaf without leaf 0x4.
-	 */
-	fill_amd_cache(infos, 2, 3, ecx); /* L2u */
-      if (edx & 0xf000)
-	fill_amd_cache(infos, 3, 3, edx); /* L3u */
-      /* FIXME: AMD MagnyCours family 0x10 model 0x9 with 8 cores or more actually
-       * have the L3 split in two halves, and associativity is divided as well (48)
-       */
-    }
+    infos->numcaches = 0;
   }
+}
 
-  /* Get thread/core + cache information from cpuid 0x04
-   * (not supported on AMD)
-   */
-  if (cpuid_type != amd && highest_cpuid >= 0x04) {
-    for (cachenum = 0; ; cachenum++) {
-      unsigned type;
-      eax = 0x04;
-      ecx = cachenum;
-      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-
-      type = eax & 0x1f;
+/* Intel cache info from CPUID 0x04 leaf */
+static void read_intel_caches(struct hwloc_x86_backend_data_s *data, struct procinfo *infos, struct cpuiddump *src_cpuiddump)
+{
+  unsigned level;
+  struct cacheinfo *tmpcaches;
+  unsigned eax, ebx, ecx, edx;
+  unsigned oldnumcaches = infos->numcaches; /* in case we got caches above */
+  unsigned cachenum;
+  struct cacheinfo *cache;
 
-      hwloc_debug("cache %u type %u\n", cachenum, type);
+  for (cachenum = 0; ; cachenum++) {
+    eax = 0x04;
+    ecx = cachenum;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
 
-      if (type == 0)
-	break;
-      infos->numcaches++;
-
-      if (!cachenum) {
-	/* by the way, get thread/core information from the first cache */
-	infos->max_nbcores = ((eax >> 26) & 0x3f) + 1;
-	infos->max_nbthreads = infos->max_log_proc / infos->max_nbcores;
-	hwloc_debug("thus %u threads\n", infos->max_nbthreads);
-	infos->threadid = infos->logprocid % infos->max_nbthreads;
-	infos->coreid = infos->logprocid / infos->max_nbthreads;
-	hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
-      }
-    }
+    hwloc_debug("cache %u type %u\n", cachenum, eax & 0x1f);
+    if ((eax & 0x1f) == 0)
+      break;
+    level = (eax >> 5) & 0x7;
+    if (data->is_knl && level == 3)
+      /* KNL reports wrong L3 information (size always 0, cpuset always the entire machine), ignore it */
+      break;
+    infos->numcaches++;
+  }
 
-    cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+  tmpcaches = realloc(infos->cache, infos->numcaches * sizeof(*infos->cache));
+  if (!tmpcaches) {
+    infos->numcaches = oldnumcaches;
+  } else {
+    infos->cache = tmpcaches;
+    cache = &infos->cache[oldnumcaches];
 
     for (cachenum = 0; ; cachenum++) {
       unsigned long linesize, linepart, ways, sets;
-      unsigned type;
       eax = 0x04;
       ecx = cachenum;
       cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
 
-      type = eax & 0x1f;
-
-      if (type == 0)
+      if ((eax & 0x1f) == 0)
+	break;
+      level = (eax >> 5) & 0x7;
+      if (data->is_knl && level == 3)
+	/* KNL reports wrong L3 information (size always 0, cpuset always the entire machine), ignore it */
 	break;
+      switch (eax & 0x1f) {
+      case 1: cache->type = HWLOC_OBJ_CACHE_DATA; break;
+      case 2: cache->type = HWLOC_OBJ_CACHE_INSTRUCTION; break;
+      default: cache->type = HWLOC_OBJ_CACHE_UNIFIED; break;
+      }
 
-      cache->type = type;
-      cache->level = (eax >> 5) & 0x7;
+      cache->level = level;
       cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
 
       cache->linesize = linesize = (ebx & 0xfff) + 1;
@@ -503,32 +424,122 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
         cache->ways = ways;
       cache->sets = sets = ecx + 1;
       cache->size = linesize * linepart * ways * sets;
-      cache->inclusiveness = edx & 0x2;
-
-      hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %uKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+      cache->inclusive = edx & 0x2;
 
+      hwloc_debug("cache %u L%u%c t%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n",
+		  cachenum, cache->level,
+		  cache->type == HWLOC_OBJ_CACHE_DATA ? 'd' : cache->type == HWLOC_OBJ_CACHE_INSTRUCTION ? 'i' : 'u',
+		  cache->nbthreads_sharing, linesize, linepart, ways, sets, cache->size >> 10);
       cache++;
     }
   }
+}
 
-  /* Get package/core/thread information from cpuid 0x0b
-   * (Intel x2APIC)
+/* AMD core/thread info from CPUID 0x80000008 leaf */
+static void read_amd_cores_legacy(struct procinfo *infos, struct cpuiddump *src_cpuiddump)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned max_nbcores;
+  unsigned max_nbthreads;
+  unsigned coreidsize;
+  unsigned logprocid;
+  unsigned threadid __hwloc_attribute_unused;
+
+  eax = 0x80000008;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+  coreidsize = (ecx >> 12) & 0xf;
+  hwloc_debug("core ID size: %u\n", coreidsize);
+  if (!coreidsize) {
+    max_nbcores = (ecx & 0xff) + 1;
+  } else
+    max_nbcores = 1 << coreidsize;
+  hwloc_debug("Thus max # of cores: %u\n", max_nbcores);
+
+  /* No multithreaded AMD for this old CPUID leaf */
+  max_nbthreads = 1 ;
+  hwloc_debug("and max # of threads: %u\n", max_nbthreads);
+
+  /* legacy_max_log_proc is deprecated, it can be smaller than max_nbcores,
+   * which is the maximum number of cores that the processor could theoretically support
+   * (see "Multiple Core Calculation" in the AMD CPUID specification).
+   * Recompute packageid/coreid accordingly.
    */
-  if (cpuid_type == intel && has_x2apic(features)) {
-    unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
-    for (level = 0; ; level++) {
-      ecx = level;
-      eax = 0x0b;
-      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
-      if (!eax && !ebx)
-        break;
+  infos->ids[PKG] = infos->apicid / max_nbcores;
+  logprocid = infos->apicid % max_nbcores;
+  infos->ids[CORE] = logprocid / max_nbthreads;
+  threadid = logprocid % max_nbthreads;
+  hwloc_debug("this is thread %u of core %u\n", threadid, infos->ids[CORE]);
+}
+
+/* AMD unit/node from CPUID 0x8000001e leaf (topoext) */
+static void read_amd_cores_topoext(struct procinfo *infos, unsigned long flags, struct cpuiddump *src_cpuiddump)
+{
+  unsigned apic_id, nodes_per_proc = 0;
+  unsigned eax, ebx, ecx, edx;
+
+  eax = 0x8000001e;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  infos->apicid = apic_id = eax;
+
+  if (flags & HWLOC_X86_DISC_FLAG_TOPOEXT_NUMANODES) {
+    if (infos->cpufamilynumber == 0x16) {
+      /* ecx is reserved */
+      infos->ids[NODE] = 0;
+      nodes_per_proc = 1;
+    } else {
+      /* AMD other families or Hygon family 18h */
+      infos->ids[NODE] = ecx & 0xff;
+      nodes_per_proc = ((ecx >> 8) & 7) + 1;
     }
-    if (level) {
+    if ((infos->cpufamilynumber == 0x15 && nodes_per_proc > 2)
+	|| ((infos->cpufamilynumber == 0x17 || infos->cpufamilynumber == 0x18) && nodes_per_proc > 4)) {
+      hwloc_debug("warning: undefined nodes_per_proc value %u, assuming it means %u\n", nodes_per_proc, nodes_per_proc);
+    }
+  }
+
+  if (infos->cpufamilynumber <= 0x16) { /* topoext appeared in 0x15 and compute-units were only used in 0x15 and 0x16 */
+    unsigned cores_per_unit;
+    /* coreid was obtained from read_amd_cores_legacy() earlier */
+    infos->ids[UNIT] = ebx & 0xff;
+    cores_per_unit = ((ebx >> 8) & 0xff) + 1;
+    hwloc_debug("topoext %08x, %u nodes, node %u, %u cores in unit %u\n", apic_id, nodes_per_proc, infos->ids[NODE], cores_per_unit, infos->ids[UNIT]);
+    /* coreid and unitid are package-wide (core 0-15 and unit 0-7 on 16-core 2-NUMAnode processor).
+     * The Linux kernel reduces these to NUMA-node-wide (by applying %core_per_node and %unit_per_node respectively).
+     * It's not clear if we should do this as well.
+     */
+  } else {
+    unsigned threads_per_core;
+    infos->ids[CORE] = ebx & 0xff;
+    threads_per_core = ((ebx >> 8) & 0xff) + 1;
+    hwloc_debug("topoext %08x, %u nodes, node %u, %u threads in core %u\n", apic_id, nodes_per_proc, infos->ids[NODE], threads_per_core, infos->ids[CORE]);
+  }
+}
+
+/* Intel core/thread or even die/module/tile from CPUID 0x0b or 0x1f leaves (v1 and v2 extended topology enumeration) */
+static void read_intel_cores_exttopoenum(struct procinfo *infos, unsigned leaf, struct cpuiddump *src_cpuiddump)
+{
+  unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
+  unsigned threadid __hwloc_attribute_unused = 0; /* shut-up compiler */
+  unsigned eax, ebx, ecx = 0, edx;
+  int apic_packageshift = 0;
+
+  for (level = 0; ; level++) {
+    ecx = level;
+    eax = leaf;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    if (!eax && !ebx)
+      break;
+    apic_packageshift = eax & 0x1f;
+  }
+
+  if (level) {
+    infos->otherids = malloc(level * sizeof(*infos->otherids));
+    if (infos->otherids) {
       infos->levels = level;
-      infos->otherids = malloc(level * sizeof(*infos->otherids));
       for (level = 0; ; level++) {
 	ecx = level;
-	eax = 0x0b;
+	eax = leaf;
 	cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
 	if (!eax && !ebx)
 	  break;
@@ -536,28 +547,259 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
 	apic_number = ebx & 0xffff;
 	apic_type = (ecx & 0xff00) >> 8;
 	apic_id = edx;
-	id = (apic_id >> apic_shift) & ((1 << (apic_nextshift - apic_shift)) - 1);
-	hwloc_debug("x2APIC %08x %d: nextshift %d num %2d type %d id %2d\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
+	id = (apic_id >> apic_shift) & ((1 << (apic_packageshift - apic_shift)) - 1);
+	hwloc_debug("x2APIC %08x %u: nextshift %u num %2u type %u id %2u\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
 	infos->apicid = apic_id;
 	infos->otherids[level] = UINT_MAX;
 	switch (apic_type) {
 	case 1:
-	  infos->threadid = id;
+	  threadid = id;
+	  /* apic_number is the actual number of threads per core */
 	  break;
 	case 2:
-	  infos->coreid = id;
+	  infos->ids[CORE] = id;
+	  /* apic_number is the actual number of threads per die */
+	  break;
+	case 3:
+	  infos->ids[MODULE] = id;
+	  /* apic_number is the actual number of threads per tile */
+	  break;
+	case 4:
+	  infos->ids[TILE] = id;
+	  /* apic_number is the actual number of threads per die */
+	  break;
+	case 5:
+	  infos->ids[DIE] = id;
+	  /* apic_number is the actual number of threads per package */
 	  break;
 	default:
-	  hwloc_debug("x2APIC %d: unknown type %d\n", level, apic_type);
+	  hwloc_debug("x2APIC %u: unknown type %u\n", level, apic_type);
 	  infos->otherids[level] = apic_id >> apic_shift;
 	  break;
 	}
 	apic_shift = apic_nextshift;
       }
       infos->apicid = apic_id;
-      infos->packageid = apic_id >> apic_shift;
-      hwloc_debug("x2APIC remainder: %d\n", infos->packageid);
-      hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      infos->ids[PKG] = apic_id >> apic_shift;
+      hwloc_debug("x2APIC remainder: %u\n", infos->ids[PKG]);
+      hwloc_debug("this is thread %u of core %u\n", threadid, infos->ids[CORE]);
+    }
+  }
+}
+
+/* Fetch information from the processor itself thanks to cpuid and store it in
+ * infos for summarize to analyze them globally */
+static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned long flags, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned cachenum;
+  struct cacheinfo *cache;
+  unsigned regs[4];
+  unsigned legacy_max_log_proc; /* not valid on Intel processors with > 256 threads, or when cpuid 0x80000008 is supported */
+  unsigned legacy_log_proc_id;
+  unsigned _model, _extendedmodel, _family, _extendedfamily;
+
+  infos->present = 1;
+
+  /* Get apicid, legacy_max_log_proc, packageid, legacy_log_proc_id from cpuid 0x01 */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  infos->apicid = ebx >> 24;
+  if (edx & (1 << 28))
+    legacy_max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
+  else
+    legacy_max_log_proc = 1;
+  hwloc_debug("APIC ID 0x%02x legacy_max_log_proc %u\n", infos->apicid, legacy_max_log_proc);
+  infos->ids[PKG] = infos->apicid / legacy_max_log_proc;
+  legacy_log_proc_id = infos->apicid % legacy_max_log_proc;
+  hwloc_debug("phys %u legacy thread %u\n", infos->ids[PKG], legacy_log_proc_id);
+
+  /* Get cpu model/family/stepping numbers from same cpuid */
+  _model          = (eax>>4) & 0xf;
+  _extendedmodel  = (eax>>16) & 0xf;
+  _family         = (eax>>8) & 0xf;
+  _extendedfamily = (eax>>20) & 0xff;
+  if ((cpuid_type == intel || cpuid_type == amd || cpuid_type == hygon) && _family == 0xf) {
+    infos->cpufamilynumber = _family + _extendedfamily;
+  } else {
+    infos->cpufamilynumber = _family;
+  }
+  if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
+      || ((cpuid_type == amd || cpuid_type == hygon) && _family == 0xf)
+      || (cpuid_type == zhaoxin && (_family == 0x6 || _family == 0x7))) {
+    infos->cpumodelnumber = _model + (_extendedmodel << 4);
+  } else {
+    infos->cpumodelnumber = _model;
+  }
+  infos->cpustepping = eax & 0xf;
+
+  if (cpuid_type == intel && infos->cpufamilynumber == 0x6 &&
+      (infos->cpumodelnumber == 0x57 || infos->cpumodelnumber == 0x85))
+    data->is_knl = 1; /* KNM is the same as KNL */
+
+  /* Get cpu vendor string from cpuid 0x00 */
+  memset(regs, 0, sizeof(regs));
+  regs[0] = 0;
+  cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
+  memcpy(infos->cpuvendor, regs+1, 4*3);
+  /* infos was calloc'ed, already ends with \0 */
+
+  /* Get cpu model string from cpuid 0x80000002-4 */
+  if (highest_ext_cpuid >= 0x80000004) {
+    memset(regs, 0, sizeof(regs));
+    regs[0] = 0x80000002;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel, regs, 4*4);
+    regs[0] = 0x80000003;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4, regs, 4*4);
+    regs[0] = 0x80000004;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
+    /* infos was calloc'ed, already ends with \0 */
+  }
+
+  if ((cpuid_type != amd && cpuid_type != hygon) && highest_cpuid >= 0x04) {
+    /* Get core/thread information from first cache reported by cpuid 0x04
+     * (not supported on AMD)
+     */
+    eax = 0x04;
+    ecx = 0;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    if ((eax & 0x1f) != 0) {
+      /* cache looks valid */
+      unsigned max_nbcores;
+      unsigned max_nbthreads;
+      unsigned threadid __hwloc_attribute_unused;
+      max_nbcores = ((eax >> 26) & 0x3f) + 1;
+      max_nbthreads = legacy_max_log_proc / max_nbcores;
+      hwloc_debug("thus %u threads\n", max_nbthreads);
+      threadid = legacy_log_proc_id % max_nbthreads;
+      infos->ids[CORE] = legacy_log_proc_id / max_nbthreads;
+      hwloc_debug("this is thread %u of core %u\n", threadid, infos->ids[CORE]);
+    }
+  }
+
+  /*********************************************************************************
+   * Get the hierarchy of thread, core, die, package, etc. from CPU-specific leaves
+   */
+
+  if (cpuid_type != intel && cpuid_type != zhaoxin && highest_ext_cpuid >= 0x80000008 && !has_x2apic(features)) {
+    /* Get core/thread information from cpuid 0x80000008
+     * (not supported on Intel)
+     * We could ignore this codepath when x2apic is supported, but we may need
+     * nodeids if HWLOC_X86_TOPOEXT_NUMANODES is set.
+     */
+    read_amd_cores_legacy(infos, src_cpuiddump);
+  }
+
+  if (cpuid_type != intel && cpuid_type != zhaoxin && has_topoext(features)) {
+    /* Get apicid, nodeid, unitid/coreid from cpuid 0x8000001e (AMD topology extension).
+     * Requires read_amd_cores_legacy() for coreid on family 0x15-16.
+     *
+     * Only needed when x2apic supported if NUMA nodes are needed.
+     */
+    read_amd_cores_topoext(infos, flags, src_cpuiddump);
+  }
+
+  if ((cpuid_type == intel) && highest_cpuid >= 0x1f) {
+    /* Get package/die/module/tile/core/thread information from cpuid 0x1f
+     * (Intel v2 Extended Topology Enumeration)
+     */
+    read_intel_cores_exttopoenum(infos, 0x1f, src_cpuiddump);
+
+  } else if ((cpuid_type == intel || cpuid_type == amd || cpuid_type == zhaoxin)
+	     && highest_cpuid >= 0x0b && has_x2apic(features)) {
+    /* Get package/core/thread information from cpuid 0x0b
+     * (Intel v1 Extended Topology Enumeration)
+     */
+    read_intel_cores_exttopoenum(infos, 0x0b, src_cpuiddump);
+  }
+
+  /**************************************
+   * Get caches from CPU-specific leaves
+   */
+
+  infos->numcaches = 0;
+  infos->cache = NULL;
+
+  if (cpuid_type != intel && cpuid_type != zhaoxin && has_topoext(features)) {
+    /* Get cache information from cpuid 0x8000001d (AMD topology extension) */
+    read_amd_caches_topoext(infos, src_cpuiddump);
+
+  } else if (cpuid_type != intel && cpuid_type != zhaoxin && highest_ext_cpuid >= 0x80000006) {
+    /* If there's no topoext,
+     * get cache information from cpuid 0x80000005 and 0x80000006.
+     * (not supported on Intel)
+     * It looks like we cannot have 0x80000005 without 0x80000006.
+     */
+    read_amd_caches_legacy(infos, src_cpuiddump, legacy_max_log_proc);
+  }
+
+  if ((cpuid_type != amd && cpuid_type != hygon) && highest_cpuid >= 0x04) {
+    /* Get cache information from cpuid 0x04
+     * (not supported on AMD)
+     */
+    read_intel_caches(data, infos, src_cpuiddump);
+  }
+
+  /* Now that we have all info, compute cacheids and apply quirks */
+  for (cachenum = 0; cachenum < infos->numcaches; cachenum++) {
+    cache = &infos->cache[cachenum];
+
+    /* default cacheid value */
+    cache->cacheid = infos->apicid / cache->nbthreads_sharing;
+
+    if (cpuid_type == amd) {
+      /* AMD quirks */
+      if (infos->cpufamilynumber == 0x17
+	  && cache->level == 3 && cache->nbthreads_sharing == 6) {
+	/* AMD family 0x17 always shares L3 between 8 APIC ids,
+	 * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
+	 * (on 24-core CPUs).
+	 */
+	cache->cacheid = infos->apicid / 8;
+
+      } else if (infos->cpufamilynumber== 0x10 && infos->cpumodelnumber == 0x9
+	  && cache->level == 3
+	  && (cache->ways == -1 || (cache->ways % 2 == 0)) && cache->nbthreads_sharing >= 8) {
+	/* Fix AMD family 0x10 model 0x9 (Magny-Cours) with 8 or 12 cores.
+	 * The L3 (and its associativity) is actually split into two halves.
+	 */
+	if (cache->nbthreads_sharing == 16)
+	  cache->nbthreads_sharing = 12; /* nbthreads_sharing is a power of 2 but the processor actually has 8 or 12 cores */
+	cache->nbthreads_sharing /= 2;
+	cache->size /= 2;
+	if (cache->ways != -1)
+	  cache->ways /= 2;
+	/* AMD Magny-Cours 12-core processors reserve APIC ids as AAAAAABBBBBB....
+	 * among first L3 (A), second L3 (B), and nonexistent cores (.).
+	 * On multi-socket servers, L3 in non-first sockets may have APIC id ranges
+	 * such as [16-21] that are not aligned on a multiple of nbthreads_sharing (6).
+	 * That means we can't just compare apicid/nbthreads_sharing to identify siblings.
+	 */
+	cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
+	  + 2 * (infos->apicid / legacy_max_log_proc); /* add 2 caches per previous package */
+
+      } else if (infos->cpufamilynumber == 0x15
+		 && (infos->cpumodelnumber == 0x1 /* Bulldozer */ || infos->cpumodelnumber == 0x2 /* Piledriver */)
+		 && cache->level == 3 && cache->nbthreads_sharing == 6) {
+	/* AMD Bulldozer and Piledriver 12-core processors have the same APIC ids as Magny-Cours above,
+	 * but we can't merge the checks because the original nbthreads_sharing must be exactly 6 here.
+	 */
+	cache->cacheid = (infos->apicid % legacy_max_log_proc) / cache->nbthreads_sharing /* cacheid within the package */
+	  + 2 * (infos->apicid / legacy_max_log_proc); /* add 2 cache per previous package */
+      }
+    } else if (cpuid_type == hygon) {
+      if (infos->cpufamilynumber == 0x18
+	  && cache->level == 3 && cache->nbthreads_sharing == 6) {
+        /* Hygon family 0x18 always shares L3 between 8 APIC ids,
+         * even when only 6 APIC ids are enabled and reported in nbthreads_sharing
+         * (on 24-core CPUs).
+         */
+        cache->cacheid = infos->apicid / 8;
+      }
     }
   }
 
@@ -568,35 +810,84 @@ static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, uns
 }
 
 static void
-hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int nodup)
+hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int replace)
 {
-  char number[8];
-  hwloc_obj_add_info_nodup(obj, "CPUVendor", info->cpuvendor, nodup);
+  char number[12];
+  if (info->cpuvendor[0])
+    hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUVendor", info->cpuvendor, replace);
   snprintf(number, sizeof(number), "%u", info->cpufamilynumber);
-  hwloc_obj_add_info_nodup(obj, "CPUFamilyNumber", number, nodup);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUFamilyNumber", number, replace);
   snprintf(number, sizeof(number), "%u", info->cpumodelnumber);
-  hwloc_obj_add_info_nodup(obj, "CPUModelNumber", number, nodup);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUModelNumber", number, replace);
   if (info->cpumodel[0]) {
     const char *c = info->cpumodel;
     while (*c == ' ')
       c++;
-    hwloc_obj_add_info_nodup(obj, "CPUModel", c, nodup);
+    hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUModel", c, replace);
   }
   snprintf(number, sizeof(number), "%u", info->cpustepping);
-  hwloc_obj_add_info_nodup(obj, "CPUStepping", number, nodup);
+  hwloc__add_info_nodup(&obj->infos, &obj->infos_count, "CPUStepping", number, replace);
+}
+
+static void
+hwloc_x86_add_groups(hwloc_topology_t topology,
+		     struct procinfo *infos,
+		     unsigned nbprocs,
+		     hwloc_bitmap_t remaining_cpuset,
+		     unsigned type,
+		     const char *subtype,
+		     unsigned kind,
+		     int dont_merge)
+{
+  hwloc_bitmap_t obj_cpuset;
+  hwloc_obj_t obj;
+  unsigned i, j;
+
+  while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+    unsigned packageid = infos[i].ids[PKG];
+    unsigned id = infos[i].ids[type];
+
+    if (id == (unsigned)-1) {
+      hwloc_bitmap_clr(remaining_cpuset, i);
+      continue;
+    }
+
+    obj_cpuset = hwloc_bitmap_alloc();
+    for (j = i; j < nbprocs; j++) {
+      if (infos[j].ids[type] == (unsigned) -1) {
+	hwloc_bitmap_clr(remaining_cpuset, j);
+	continue;
+      }
+
+      if (infos[j].ids[PKG] == packageid && infos[j].ids[type] == id) {
+	hwloc_bitmap_set(obj_cpuset, j);
+	hwloc_bitmap_clr(remaining_cpuset, j);
+      }
+    }
+
+    obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, id);
+    obj->cpuset = obj_cpuset;
+    obj->subtype = strdup(subtype);
+    obj->attr->group.kind = kind;
+    obj->attr->group.dont_merge = dont_merge;
+    hwloc_debug_2args_bitmap("os %s %u has cpuset %s\n",
+			     subtype, id, obj_cpuset);
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
 }
 
 /* Analyse information stored in infos, and build/annotate topology levels accordingly */
-static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery)
+static void summarize(struct hwloc_backend *backend, struct procinfo *infos, unsigned long flags)
 {
   struct hwloc_topology *topology = backend->topology;
   struct hwloc_x86_backend_data_s *data = backend->private_data;
   unsigned nbprocs = data->nbprocs;
   hwloc_bitmap_t complete_cpuset = hwloc_bitmap_alloc();
-  unsigned i, j, l, level, type;
-  unsigned nbpackages = 0;
+  unsigned i, j, l, level;
   int one = -1;
-  unsigned next_group_depth = topology->next_group_depth;
+  hwloc_bitmap_t remaining_cpuset;
+  int gotnuma = 0;
+  int fulldiscovery = (flags & HWLOC_X86_DISC_FLAG_FULL);
 
   for (i = 0; i < nbprocs; i++)
     if (infos[i].present) {
@@ -609,256 +900,233 @@ static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int
     return;
   }
 
+  remaining_cpuset = hwloc_bitmap_alloc();
+
   /* Ideally, when fulldiscovery=0, we could add any object that doesn't exist yet.
    * But what if the x86 and the native backends disagree because one is buggy? Which one to trust?
-   * Only annotate existing objects for now.
+   * We only add missing caches, and annotate other existing objects for now.
    */
 
- /*Anotate previously existing objects*/
-  if(!fulldiscovery){
-    hwloc_obj_t pu;
-    nbpackages = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
-    for(pu = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU  ,NULL);
-     pu!=NULL;
-     pu = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU ,pu)){
-      unsigned infoId = pu->os_index;
-      if(infoId<0)
-        continue;
-      
-      int numCaches = infos[infoId].numcaches;
-      struct cacheinfo **caches = malloc(numCaches*sizeof(struct cacheinfo*));
-      int i;
-      for(i = 0 ;i<numCaches;i++){
-        caches[i] = &(infos[infoId].cache[i]);
-      }
-
-
-      hwloc_obj_t object;
-      for(object = pu;object!=NULL;object = object->parent) {
-        switch(object->type){
-        /* Annotate packages previously-existing cache */
-        case HWLOC_OBJ_CACHE:
-          {
-            if (hwloc_obj_get_info_by_name(object,"inclusiveness"))
-              break;
-            unsigned char type = 0;
-            switch(object->attr->cache.type){
-              case HWLOC_OBJ_CACHE_DATA : type = 1;
-                break;
-              case HWLOC_OBJ_CACHE_INSTRUCTION : type = 2;
-                break;
-              case HWLOC_OBJ_CACHE_UNIFIED : type = 3;
-                break;
-            }
-            int cacheId =-1; 
-            for(i=0;i<numCaches;i++)
-              if(caches[i]->level == object->attr->cache.depth){ // the level is exact, not always the type. If at the level there is a cache with the good type we return it. Else we return a random cache of the level. 
-                cacheId = i;
-                if(caches[i]->type == type)
-                  break;
-              }
-            if (cacheId >= 0)
-                hwloc_obj_add_info(object,"inclusiveness",caches[cacheId]->inclusiveness?"true":"false");
-
-          }
-          break;
-        case HWLOC_OBJ_PACKAGE:
-          { 
-            /* Annotate packages previously-existing package */
-	    // FIXME: ideally, we should check all bits in case x86 and the native backend disagree. 
-	       
-            //We already know the pakage from topology-linux. We only check if the package detected by x86 doesn't disagree
-	    if (infos[i].packageid == object->os_index || object->os_index == (unsigned) -1) { 
-	      hwloc_x86_add_cpuinfos(object, &infos[infoId], 1);
-            }
-          }
-        break;
-	default:
-	break;
-	}
-      }
-      free(caches);
-    }
-  }
-
-
-  /* Look for packages */
-  if (fulldiscovery) {
-    hwloc_bitmap_t packages_cpuset = hwloc_bitmap_dup(complete_cpuset);
-    hwloc_bitmap_t package_cpuset;
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
+    /* Look for packages */
     hwloc_obj_t package;
 
-    while ((i = hwloc_bitmap_first(packages_cpuset)) != (unsigned) -1) {
-      unsigned packageid = infos[i].packageid;
+    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+      if (fulldiscovery) {
+	unsigned packageid = infos[i].ids[PKG];
+	hwloc_bitmap_t package_cpuset = hwloc_bitmap_alloc();
 
-      package_cpuset = hwloc_bitmap_alloc();
-      for (j = i; j < nbprocs; j++) {
-        if (infos[j].packageid == packageid) {
-          hwloc_bitmap_set(package_cpuset, j);
-          hwloc_bitmap_clr(packages_cpuset, j);
-        }
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].ids[PKG] == packageid) {
+	    hwloc_bitmap_set(package_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	package = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, packageid);
+	package->cpuset = package_cpuset;
+
+	hwloc_x86_add_cpuinfos(package, &infos[i], 0);
+
+	hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+				packageid, package_cpuset);
+	hwloc_insert_object_by_cpuset(topology, package);
+
+      } else {
+	/* Annotate previously-existing packages */
+	hwloc_bitmap_t set = hwloc_bitmap_alloc();
+	hwloc_bitmap_set(set, i);
+	package = hwloc_get_next_obj_covering_cpuset_by_type(topology, set, HWLOC_OBJ_PACKAGE, NULL);
+	hwloc_bitmap_free(set);
+	if (package) {
+	  /* Found package above that PU, annotate if no such attribute yet */
+	  hwloc_x86_add_cpuinfos(package, &infos[i], 1);
+	  hwloc_bitmap_andnot(remaining_cpuset, remaining_cpuset, package->cpuset);
+	} else {
+	  /* No package, annotate the root object */
+	  hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[i], 1);
+	  break;
+	}
       }
-      package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, packageid);
-      package->cpuset = package_cpuset;
-
-      hwloc_x86_add_cpuinfos(package, &infos[i], 0);
-
-      hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
-          packageid, package_cpuset);
-      hwloc_insert_object_by_cpuset(topology, package);
-      nbpackages++;
     }
-    hwloc_bitmap_free(packages_cpuset);
-
   }
 
-  /* If there was no package, annotate the Machine instead */
-  if ((!nbpackages) && infos[0].cpumodel[0]) {
-    hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[0], 1);
-  }
-
-  /* Look for Numa nodes inside packages */
-  if (fulldiscovery) {
-    hwloc_bitmap_t nodes_cpuset = hwloc_bitmap_dup(complete_cpuset);
+  /* Look for Numa nodes inside packages (cannot be filtered-out) */
+  if (fulldiscovery && (flags & HWLOC_X86_DISC_FLAG_TOPOEXT_NUMANODES)) {
     hwloc_bitmap_t node_cpuset;
     hwloc_obj_t node;
 
     /* FIXME: if there's memory inside the root object, divide it into NUMA nodes? */
 
-    while ((i = hwloc_bitmap_first(nodes_cpuset)) != (unsigned) -1) {
-      unsigned packageid = infos[i].packageid;
-      unsigned nodeid = infos[i].nodeid;
+    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].ids[PKG];
+      unsigned nodeid = infos[i].ids[NODE];
 
       if (nodeid == (unsigned)-1) {
-        hwloc_bitmap_clr(nodes_cpuset, i);
+        hwloc_bitmap_clr(remaining_cpuset, i);
 	continue;
       }
 
       node_cpuset = hwloc_bitmap_alloc();
       for (j = i; j < nbprocs; j++) {
-	if (infos[j].nodeid == (unsigned) -1) {
-	  hwloc_bitmap_clr(nodes_cpuset, j);
+	if (infos[j].ids[NODE] == (unsigned) -1) {
+	  hwloc_bitmap_clr(remaining_cpuset, j);
 	  continue;
 	}
 
-        if (infos[j].packageid == packageid && infos[j].nodeid == nodeid) {
+        if (infos[j].ids[PKG] == packageid && infos[j].ids[NODE] == nodeid) {
           hwloc_bitmap_set(node_cpuset, j);
-          hwloc_bitmap_clr(nodes_cpuset, j);
+          hwloc_bitmap_clr(remaining_cpuset, j);
         }
       }
-      node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, nodeid);
+      node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, nodeid);
       node->cpuset = node_cpuset;
       node->nodeset = hwloc_bitmap_alloc();
       hwloc_bitmap_set(node->nodeset, nodeid);
       hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
           nodeid, node_cpuset);
       hwloc_insert_object_by_cpuset(topology, node);
+      gotnuma++;
     }
-    hwloc_bitmap_free(nodes_cpuset);
   }
 
-  /* Look for Compute units inside packages */
-  if (fulldiscovery) {
-    hwloc_bitmap_t units_cpuset = hwloc_bitmap_dup(complete_cpuset);
-    hwloc_bitmap_t unit_cpuset;
-    hwloc_obj_t unit;
-
-    while ((i = hwloc_bitmap_first(units_cpuset)) != (unsigned) -1) {
-      unsigned packageid = infos[i].packageid;
-      unsigned unitid = infos[i].unitid;
-
-      if (unitid == (unsigned)-1) {
-        hwloc_bitmap_clr(units_cpuset, i);
-	continue;
-      }
-
-      unit_cpuset = hwloc_bitmap_alloc();
-      for (j = i; j < nbprocs; j++) {
-	if (infos[j].unitid == (unsigned) -1) {
-	  hwloc_bitmap_clr(units_cpuset, j);
-	  continue;
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+    if (fulldiscovery) {
+      /* Look for AMD Compute units inside packages */
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      hwloc_x86_add_groups(topology, infos, nbprocs, remaining_cpuset,
+			   UNIT, "Compute Unit",
+			   HWLOC_GROUP_KIND_AMD_COMPUTE_UNIT, 0);
+      /* Look for Intel Modules inside packages */
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      hwloc_x86_add_groups(topology, infos, nbprocs, remaining_cpuset,
+			   MODULE, "Module",
+			   HWLOC_GROUP_KIND_INTEL_MODULE, 0);
+      /* Look for Intel Tiles inside packages */
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      hwloc_x86_add_groups(topology, infos, nbprocs, remaining_cpuset,
+			   TILE, "Tile",
+			   HWLOC_GROUP_KIND_INTEL_TILE, 0);
+
+      /* Look for unknown objects */
+      if (infos[one].otherids) {
+	for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
+	  if (infos[one].otherids[level] != UINT_MAX) {
+	    hwloc_bitmap_t unknown_cpuset;
+	    hwloc_obj_t unknown_obj;
+
+	    hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+	    while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	      unsigned unknownid = infos[i].otherids[level];
+
+	      unknown_cpuset = hwloc_bitmap_alloc();
+	      for (j = i; j < nbprocs; j++) {
+		if (infos[j].otherids[level] == unknownid) {
+		  hwloc_bitmap_set(unknown_cpuset, j);
+		  hwloc_bitmap_clr(remaining_cpuset, j);
+		}
+	      }
+	      unknown_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, unknownid);
+	      unknown_obj->cpuset = unknown_cpuset;
+	      unknown_obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_EXTTOPOENUM_UNKNOWN;
+	      unknown_obj->attr->group.subkind = level;
+	      hwloc_debug_2args_bitmap("os unknown%u %u has cpuset %s\n",
+				       level, unknownid, unknown_cpuset);
+	      hwloc_insert_object_by_cpuset(topology, unknown_obj);
+	    }
+	  }
 	}
-
-        if (infos[j].packageid == packageid && infos[j].unitid == unitid) {
-          hwloc_bitmap_set(unit_cpuset, j);
-          hwloc_bitmap_clr(units_cpuset, j);
-        }
       }
-      unit = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unitid);
-      unit->cpuset = unit_cpuset;
-      hwloc_debug_1arg_bitmap("os unit %u has cpuset %s\n",
-          unitid, unit_cpuset);
-      hwloc_insert_object_by_cpuset(topology, unit);
     }
-    hwloc_bitmap_free(units_cpuset);
   }
 
-  /* Look for unknown objects */
-  if (infos[one].otherids) {
-    for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
-      if (infos[one].otherids[level] != UINT_MAX) {
-	hwloc_bitmap_t unknowns_cpuset = hwloc_bitmap_dup(complete_cpuset);
-	hwloc_bitmap_t unknown_cpuset;
-	hwloc_obj_t unknown_obj;
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_DIE)) {
+    /* Look for Intel Dies inside packages */
+    if (fulldiscovery) {
+      hwloc_bitmap_t die_cpuset;
+      hwloc_obj_t die;
 
-	while ((i = hwloc_bitmap_first(unknowns_cpuset)) != (unsigned) -1) {
-	  unsigned unknownid = infos[i].otherids[level];
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	unsigned packageid = infos[i].ids[PKG];
+	unsigned dieid = infos[i].ids[DIE];
 
-	  unknown_cpuset = hwloc_bitmap_alloc();
-	  for (j = i; j < nbprocs; j++) {
-	    if (infos[j].otherids[level] == unknownid) {
-	      hwloc_bitmap_set(unknown_cpuset, j);
-	      hwloc_bitmap_clr(unknowns_cpuset, j);
-	    }
+	if (dieid == (unsigned) -1) {
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
+
+	die_cpuset = hwloc_bitmap_alloc();
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].ids[DIE] == (unsigned) -1) {
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	    continue;
+	  }
+
+	  if (infos[j].ids[PKG] == packageid && infos[j].ids[DIE] == dieid) {
+	    hwloc_bitmap_set(die_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
 	  }
-	  unknown_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unknownid);
-	  unknown_obj->cpuset = unknown_cpuset;
-	  unknown_obj->attr->group.depth = topology->next_group_depth + level;
-	  if (next_group_depth <= topology->next_group_depth + level)
-	    next_group_depth = topology->next_group_depth + level + 1;
-	  hwloc_debug_2args_bitmap("os unknown%d %u has cpuset %s\n",
-	      level, unknownid, unknown_cpuset);
-	  hwloc_insert_object_by_cpuset(topology, unknown_obj);
 	}
-	hwloc_bitmap_free(unknowns_cpuset);
+	die = hwloc_alloc_setup_object(topology, HWLOC_OBJ_DIE, dieid);
+	die->cpuset = die_cpuset;
+	hwloc_debug_1arg_bitmap("os die %u has cpuset %s\n",
+				dieid, die_cpuset);
+	hwloc_insert_object_by_cpuset(topology, die);
       }
     }
   }
 
-  /* Look for cores */
-  if (fulldiscovery) {
-    hwloc_bitmap_t cores_cpuset = hwloc_bitmap_dup(complete_cpuset);
-    hwloc_bitmap_t core_cpuset;
-    hwloc_obj_t core;
+  if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
+    /* Look for cores */
+    if (fulldiscovery) {
+      hwloc_bitmap_t core_cpuset;
+      hwloc_obj_t core;
 
-    while ((i = hwloc_bitmap_first(cores_cpuset)) != (unsigned) -1) {
-      unsigned packageid = infos[i].packageid;
-      unsigned coreid = infos[i].coreid;
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	unsigned packageid = infos[i].ids[PKG];
+	unsigned nodeid = infos[i].ids[NODE];
+	unsigned coreid = infos[i].ids[CORE];
 
-      if (coreid == (unsigned) -1) {
-        hwloc_bitmap_clr(cores_cpuset, i);
-	continue;
-      }
-
-      core_cpuset = hwloc_bitmap_alloc();
-      for (j = i; j < nbprocs; j++) {
-	if (infos[j].coreid == (unsigned) -1) {
-	  hwloc_bitmap_clr(cores_cpuset, j);
+	if (coreid == (unsigned) -1) {
+	  hwloc_bitmap_clr(remaining_cpuset, i);
 	  continue;
 	}
 
-        if (infos[j].packageid == packageid && infos[j].coreid == coreid) {
-          hwloc_bitmap_set(core_cpuset, j);
-          hwloc_bitmap_clr(cores_cpuset, j);
-        }
+	core_cpuset = hwloc_bitmap_alloc();
+	for (j = i; j < nbprocs; j++) {
+	  if (infos[j].ids[CORE] == (unsigned) -1) {
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	    continue;
+	  }
+
+	  if (infos[j].ids[PKG] == packageid && infos[j].ids[NODE] == nodeid && infos[j].ids[CORE] == coreid) {
+	    hwloc_bitmap_set(core_cpuset, j);
+	    hwloc_bitmap_clr(remaining_cpuset, j);
+	  }
+	}
+	core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, coreid);
+	core->cpuset = core_cpuset;
+	hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+				coreid, core_cpuset);
+	hwloc_insert_object_by_cpuset(topology, core);
       }
-      core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, coreid);
-      core->cpuset = core_cpuset;
-      hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
-          coreid, core_cpuset);
-      hwloc_insert_object_by_cpuset(topology, core);
     }
-    hwloc_bitmap_free(cores_cpuset);
+  }
+
+  /* Look for PUs (cannot be filtered-out) */
+  if (fulldiscovery) {
+    hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+    for (i=0; i<nbprocs; i++)
+      if(infos[i].present) { /* Only add present PU. We don't know if others actually exist */
+       struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, i);
+       obj->cpuset = hwloc_bitmap_alloc();
+       hwloc_bitmap_only(obj->cpuset, i);
+       hwloc_debug_1arg_bitmap("PU %u has cpuset %s\n", i, obj->cpuset);
+       hwloc_insert_object_by_cpuset(topology, obj);
+     }
   }
 
   /* Look for caches */
@@ -868,91 +1136,96 @@ static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int
     for (j = 0; j < infos[i].numcaches; j++)
       if (infos[i].cache[j].level > level)
         level = infos[i].cache[j].level;
-
-  /* Look for known types */
-  if (fulldiscovery) while (level > 0) {
-    for (type = 1; type <= 3; type++) {
+  while (level > 0) {
+    hwloc_obj_cache_type_t type;
+    HWLOC_BUILD_ASSERT(HWLOC_OBJ_CACHE_DATA == HWLOC_OBJ_CACHE_UNIFIED+1);
+    HWLOC_BUILD_ASSERT(HWLOC_OBJ_CACHE_INSTRUCTION == HWLOC_OBJ_CACHE_DATA+1);
+    for (type = HWLOC_OBJ_CACHE_UNIFIED; type <= HWLOC_OBJ_CACHE_INSTRUCTION; type++) {
       /* Look for caches of that type at level level */
-      {
-	hwloc_bitmap_t caches_cpuset = hwloc_bitmap_dup(complete_cpuset);
-	hwloc_bitmap_t cache_cpuset;
-	hwloc_obj_t cache;
+      hwloc_obj_type_t otype;
+      hwloc_obj_t cache;
 
-	while ((i = hwloc_bitmap_first(caches_cpuset)) != (unsigned) -1) {
-	  unsigned packageid = infos[i].packageid;
+      otype = hwloc_cache_type_by_depth_type(level, type);
+      if (otype == HWLOC_OBJ_TYPE_NONE)
+	continue;
+      if (!hwloc_filter_check_keep_object_type(topology, otype))
+	continue;
 
-	  for (l = 0; l < infos[i].numcaches; l++) {
-	    if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
-	      break;
-	  }
-	  if (l == infos[i].numcaches) {
-	    /* no cache Llevel of that type in i */
-	    hwloc_bitmap_clr(caches_cpuset, i);
-	    continue;
-	  }
+      hwloc_bitmap_copy(remaining_cpuset, complete_cpuset);
+      while ((i = hwloc_bitmap_first(remaining_cpuset)) != (unsigned) -1) {
+	hwloc_bitmap_t puset;
 
-	  /* Found a matching cache, now look for others sharing it */
-	  {
-	    unsigned cacheid = infos[i].apicid / infos[i].cache[l].nbthreads_sharing;
+	for (l = 0; l < infos[i].numcaches; l++) {
+	  if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
+	    break;
+	}
+	if (l == infos[i].numcaches) {
+	  /* no L<level> cache of that type in PU i */
+	  hwloc_bitmap_clr(remaining_cpuset, i);
+	  continue;
+	}
 
-	    cache_cpuset = hwloc_bitmap_alloc();
-	    for (j = i; j < nbprocs; j++) {
-	      unsigned l2;
-	      for (l2 = 0; l2 < infos[j].numcaches; l2++) {
-		if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
-		  break;
-	      }
-	      if (l2 == infos[j].numcaches) {
-		/* no cache Llevel of that type in j */
-		hwloc_bitmap_clr(caches_cpuset, j);
-		continue;
-	      }
-	      if (infos[j].packageid == packageid && infos[j].apicid / infos[j].cache[l2].nbthreads_sharing == cacheid) {
-		hwloc_bitmap_set(cache_cpuset, j);
-		hwloc_bitmap_clr(caches_cpuset, j);
-	      }
-	    }
-	    cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, cacheid);
-	    cache->attr->cache.depth = level;
-	    cache->attr->cache.size = infos[i].cache[l].size;
-	    cache->attr->cache.linesize = infos[i].cache[l].linesize;
-	    cache->attr->cache.associativity = infos[i].cache[l].ways;
-	    switch (infos[i].cache[l].type) {
-	      case 1:
-		cache->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
-		break;
-	      case 2:
-		cache->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
-		break;
-	      case 3:
-		cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+	puset = hwloc_bitmap_alloc();
+	hwloc_bitmap_set(puset, i);
+	cache = hwloc_get_next_obj_covering_cpuset_by_type(topology, puset, otype, NULL);
+	hwloc_bitmap_free(puset);
+
+	if (cache) {
+	  /* Found cache above that PU, annotate if no such attribute yet */
+	  if (!hwloc_obj_get_info_by_name(cache, "Inclusive"))
+	    hwloc_obj_add_info(cache, "Inclusive", infos[i].cache[l].inclusive ? "1" : "0");
+	  hwloc_bitmap_andnot(remaining_cpuset, remaining_cpuset, cache->cpuset);
+	} else {
+	  /* Add the missing cache */
+	  hwloc_bitmap_t cache_cpuset;
+	  unsigned packageid = infos[i].ids[PKG];
+	  unsigned cacheid = infos[i].cache[l].cacheid;
+	  /* Now look for others sharing it */
+	  cache_cpuset = hwloc_bitmap_alloc();
+	  for (j = i; j < nbprocs; j++) {
+	    unsigned l2;
+	    for (l2 = 0; l2 < infos[j].numcaches; l2++) {
+	      if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
 		break;
 	    }
-            hwloc_obj_add_info(cache,"inclusiveness",infos[i].cache[l].inclusiveness?"true":"false");
-	    cache->cpuset = cache_cpuset;
-	    hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
-		level, cacheid, cache_cpuset);
-	    hwloc_insert_object_by_cpuset(topology, cache);
+	    if (l2 == infos[j].numcaches) {
+	      /* no L<level> cache of that type in PU j */
+	      hwloc_bitmap_clr(remaining_cpuset, j);
+	      continue;
+	    }
+	    if (infos[j].ids[PKG] == packageid && infos[j].cache[l2].cacheid == cacheid) {
+	      hwloc_bitmap_set(cache_cpuset, j);
+	      hwloc_bitmap_clr(remaining_cpuset, j);
+	    }
 	  }
+	  cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
+	  cache->attr->cache.depth = level;
+	  cache->attr->cache.size = infos[i].cache[l].size;
+	  cache->attr->cache.linesize = infos[i].cache[l].linesize;
+	  cache->attr->cache.associativity = infos[i].cache[l].ways;
+	  cache->attr->cache.type = infos[i].cache[l].type;
+	  cache->cpuset = cache_cpuset;
+	  hwloc_obj_add_info(cache, "Inclusive", infos[i].cache[l].inclusive ? "1" : "0");
+	  hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
+				   level, cacheid, cache_cpuset);
+	  hwloc_insert_object_by_cpuset(topology, cache);
 	}
-	hwloc_bitmap_free(caches_cpuset);
       }
     }
     level--;
   }
 
-  for (i = 0; i < nbprocs; i++) {
-    free(infos[i].cache);
-    if (infos[i].otherids)
-      free(infos[i].otherids);
-  }
+  /* FIXME: if KNL and L2 disabled, add tiles instead of L2 */
 
+  hwloc_bitmap_free(remaining_cpuset);
   hwloc_bitmap_free(complete_cpuset);
-  topology->next_group_depth = next_group_depth;
+
+  if (gotnuma)
+    topology->support.discovery->numa = 1;
 }
 
 static int
-look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery,
+look_procs(struct hwloc_backend *backend, struct procinfo *infos, unsigned long flags,
 	   unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type,
 	   int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags),
 	   int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags))
@@ -977,16 +1250,18 @@ look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscov
     struct cpuiddump *src_cpuiddump = NULL;
     if (data->src_cpuiddump_path) {
       src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, i);
+      if (!src_cpuiddump)
+	continue;
     } else {
       hwloc_bitmap_only(set, i);
-      hwloc_debug("binding to CPU%d\n", i);
+      hwloc_debug("binding to CPU%u\n", i);
       if (set_cpubind(topology, set, HWLOC_CPUBIND_STRICT)) {
-	hwloc_debug("could not bind to CPU%d: %s\n", i, strerror(errno));
+	hwloc_debug("could not bind to CPU%u: %s\n", i, strerror(errno));
 	continue;
       }
     }
 
-    look_proc(backend, &infos[i], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+    look_proc(backend, &infos[i], flags, highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
 
     if (data->src_cpuiddump_path) {
       cpuiddump_free(src_cpuiddump);
@@ -999,10 +1274,11 @@ look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscov
     hwloc_bitmap_free(orig_cpuset);
   }
 
-  if (!data->apicid_unique)
-    fulldiscovery = 0;
-  summarize(backend, infos, fulldiscovery);
-  return fulldiscovery; /* success, but objects added only if fulldiscovery */
+  if (data->apicid_unique)
+    summarize(backend, infos, flags);
+  /* if !data->apicid_unique, do nothing and return success, so that the caller does nothing either */
+
+  return 0;
 }
 
 #if defined HWLOC_FREEBSD_SYS && defined HAVE_CPUSET_SETID
@@ -1030,15 +1306,30 @@ static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state __hwloc_attribut
 static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
 #endif /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
 
-
+/* GenuineIntel */
 #define INTEL_EBX ('G' | ('e'<<8) | ('n'<<16) | ('u'<<24))
 #define INTEL_EDX ('i' | ('n'<<8) | ('e'<<16) | ('I'<<24))
 #define INTEL_ECX ('n' | ('t'<<8) | ('e'<<16) | ('l'<<24))
 
+/* AuthenticAMD */
 #define AMD_EBX ('A' | ('u'<<8) | ('t'<<16) | ('h'<<24))
 #define AMD_EDX ('e' | ('n'<<8) | ('t'<<16) | ('i'<<24))
 #define AMD_ECX ('c' | ('A'<<8) | ('M'<<16) | ('D'<<24))
 
+/* HYGON "HygonGenuine" */
+#define HYGON_EBX ('H' | ('y'<<8) | ('g'<<16) | ('o'<<24))
+#define HYGON_EDX ('n' | ('G'<<8) | ('e'<<16) | ('n'<<24))
+#define HYGON_ECX ('u' | ('i'<<8) | ('n'<<16) | ('e'<<24))
+
+/* (Zhaoxin) CentaurHauls */
+#define ZX_EBX ('C' | ('e'<<8) | ('n'<<16) | ('t'<<24))
+#define ZX_EDX ('a' | ('u'<<8) | ('r'<<16) | ('H'<<24))
+#define ZX_ECX ('a' | ('u'<<8) | ('l'<<16) | ('s'<<24))
+/* (Zhaoxin) Shanghai */
+#define SH_EBX (' ' | (' '<<8) | ('S'<<16) | ('h'<<24))
+#define SH_EDX ('a' | ('n'<<8) | ('g'<<16) | ('h'<<24))
+#define SH_ECX ('a' | ('i'<<8) | (' '<<16) | (' '<<24))
+
 /* fake cpubind for when nbprocs=1 and no binding support */
 static int fake_get_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
 			    hwloc_cpuset_t set __hwloc_attribute_unused,
@@ -1054,7 +1345,7 @@ static int fake_set_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
 }
 
 static
-int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
+int hwloc_look_x86(struct hwloc_backend *backend, unsigned long flags)
 {
   struct hwloc_x86_backend_data_s *data = backend->private_data;
   unsigned nbprocs = data->nbprocs;
@@ -1076,19 +1367,31 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
   int ret = -1;
 
   if (data->src_cpuiddump_path) {
-    /* just read cpuid from the dump */
+    /* Just read cpuid from the dump (implies !topology->is_thissystem by default) */
     src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, 0);
+    if (!src_cpuiddump)
+      goto out;
+
   } else {
-    /* otherwise check if binding works */
+    /* Using real hardware.
+     * However we don't enforce topology->is_thissystem so that
+     * we may still force the use of this backend when debugging with !thissystem.
+     */
+
+    /* check if binding works */
     memset(&hooks, 0, sizeof(hooks));
     support.membind = &memsupport;
     hwloc_set_native_binding_hooks(&hooks, &support);
-    if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
-      get_cpubind = hooks.get_thisproc_cpubind;
-      set_cpubind = hooks.set_thisproc_cpubind;
-    } else if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
+    if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
       get_cpubind = hooks.get_thisthread_cpubind;
       set_cpubind = hooks.set_thisthread_cpubind;
+    } else if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
+      /* FIXME: if called by a multithreaded program, we will restore the original process binding
+       * for each thread instead of their own original thread binding.
+       * See issue #158.
+       */
+      get_cpubind = hooks.get_thisproc_cpubind;
+      set_cpubind = hooks.set_thisproc_cpubind;
     } else {
       /* we need binding support if there are multiple PUs */
       if (nbprocs > 1)
@@ -1105,11 +1408,13 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
   if (NULL == infos)
     goto out;
   for (i = 0; i < nbprocs; i++) {
-    infos[i].nodeid = (unsigned) -1;
-    infos[i].packageid = (unsigned) -1;
-    infos[i].unitid = (unsigned) -1;
-    infos[i].coreid = (unsigned) -1;
-    infos[i].threadid = (unsigned) -1;
+    infos[i].ids[PKG] = (unsigned) -1;
+    infos[i].ids[CORE] = (unsigned) -1;
+    infos[i].ids[NODE] = (unsigned) -1;
+    infos[i].ids[UNIT] = (unsigned) -1;
+    infos[i].ids[TILE] = (unsigned) -1;
+    infos[i].ids[MODULE] = (unsigned) -1;
+    infos[i].ids[DIE] = (unsigned) -1;
   }
 
   eax = 0x00;
@@ -1117,8 +1422,13 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
   highest_cpuid = eax;
   if (ebx == INTEL_EBX && ecx == INTEL_ECX && edx == INTEL_EDX)
     cpuid_type = intel;
-  if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
+  else if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
     cpuid_type = amd;
+  else if ((ebx == ZX_EBX && ecx == ZX_ECX && edx == ZX_EDX)
+	   || (ebx == SH_EBX && ecx == SH_ECX && edx == SH_EDX))
+    cpuid_type = zhaoxin;
+  else if (ebx == HYGON_EBX && ecx == HYGON_ECX && edx == HYGON_EDX)
+    cpuid_type = hygon;
 
   hwloc_debug("highest cpuid %x, cpuid type %u\n", highest_cpuid, cpuid_type);
   if (highest_cpuid < 0x01) {
@@ -1138,6 +1448,7 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
 
   if (highest_cpuid >= 0x7) {
     eax = 0x7;
+    ecx = 0;
     cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
     features[9] = ebx;
   }
@@ -1151,18 +1462,18 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
 
   hwloc_x86_os_state_save(&os_state, src_cpuiddump);
 
-  ret = look_procs(backend, infos, fulldiscovery,
+  ret = look_procs(backend, infos, flags,
 		   highest_cpuid, highest_ext_cpuid, features, cpuid_type,
 		   get_cpubind, set_cpubind);
-  if (ret >= 0)
+  if (!ret)
     /* success, we're done */
     goto out_with_os_state;
 
   if (nbprocs == 1) {
     /* only one processor, no need to bind */
-    look_proc(backend, &infos[0], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
-    summarize(backend, infos, fulldiscovery);
-    ret = fulldiscovery;
+    look_proc(backend, &infos[0], flags, highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+    summarize(backend, infos, flags);
+    ret = 0;
   }
 
 out_with_os_state:
@@ -1170,7 +1481,11 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
 
 out_with_infos:
   if (NULL != infos) {
-      free(infos);
+    for (i = 0; i < nbprocs; i++) {
+      free(infos[i].cache);
+      free(infos[i].otherids);
+    }
+    free(infos);
   }
 
 out:
@@ -1180,20 +1495,39 @@ int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
 }
 
 static int
-hwloc_x86_discover(struct hwloc_backend *backend)
+hwloc_x86_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
 {
   struct hwloc_x86_backend_data_s *data = backend->private_data;
   struct hwloc_topology *topology = backend->topology;
+  unsigned long flags = 0;
   int alreadypus = 0;
   int ret;
 
-  if (!data->src_cpuiddump_path) {
-    data->nbprocs = hwloc_fallback_nbprocessors(topology);
+  assert(dstatus->phase == HWLOC_DISC_PHASE_CPU);
 
-    if (!topology->is_thissystem) {
-      hwloc_debug("%s", "\nno x86 detection (not thissystem)\n");
-      return 0;
-    }
+  if (getenv("HWLOC_X86_TOPOEXT_NUMANODES")) {
+    flags |= HWLOC_X86_DISC_FLAG_TOPOEXT_NUMANODES;
+  }
+
+#if HAVE_DECL_RUNNING_ON_VALGRIND
+  if (RUNNING_ON_VALGRIND && !data->src_cpuiddump_path) {
+    fprintf(stderr, "hwloc x86 backend cannot work under Valgrind, disabling.\n"
+	    "May be reenabled by dumping CPUIDs with hwloc-gather-cpuid\n"
+	    "and reloading them under Valgrind with HWLOC_CPUID_PATH.\n");
+    return 0;
+  }
+#endif
+
+  if (data->src_cpuiddump_path) {
+    assert(data->nbprocs > 0); /* enforced by hwloc_x86_component_instantiate() */
+    topology->support.discovery->pu = 1;
+  } else {
+    int nbprocs = hwloc_fallback_nbprocessors(HWLOC_FALLBACK_NBPROCESSORS_INCLUDE_OFFLINE);
+    if (nbprocs >= 1)
+      topology->support.discovery->pu = 1;
+    else
+      nbprocs = 1;
+    data->nbprocs = (unsigned) nbprocs;
   }
 
   if (topology->levels[0][0]->cpuset) {
@@ -1204,22 +1538,23 @@ hwloc_x86_discover(struct hwloc_backend *backend)
       goto fulldiscovery;
     }
 
-    /* several object types were added, we can't easily complete, just annotate a bit */
-    ret = hwloc_look_x86(backend, 0);
+    /* several object types were added, we can't easily complete, just do partial discovery */
+    hwloc_topology_reconnect(topology, 0);
+    ret = hwloc_look_x86(backend, flags);
     if (ret)
       hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
     return 0;
   } else {
     /* topology is empty, initialize it */
-    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+    hwloc_alloc_root_sets(topology->levels[0][0]);
   }
 
 fulldiscovery:
-  hwloc_look_x86(backend, 1);
-  /* if failed, just continue and create PUs */
-
-  if (!alreadypus)
-    hwloc_setup_pu_level(topology, data->nbprocs);
+  if (hwloc_look_x86(backend, flags | HWLOC_X86_DISC_FLAG_FULL) < 0) {
+    /* if failed, create PUs */
+    if (!alreadypus)
+      hwloc_setup_pu_level(topology, data->nbprocs);
+  }
 
   hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
 
@@ -1242,6 +1577,8 @@ hwloc_x86_discover(struct hwloc_backend *backend)
 static int
 hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t set)
 {
+
+#if !(defined HWLOC_WIN_SYS && !defined __MINGW32__ && !defined __CYGWIN__) /* needs a lot of work */
   struct dirent *dirent;
   DIR *dir;
   char *path;
@@ -1249,31 +1586,27 @@ hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t s
   char line [32];
 
   dir = opendir(src_cpuiddump_path);
-  if (!dir)
+  if (!dir) 
     return -1;
 
   path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1);
   if (!path)
     goto out_with_dir;
-
   sprintf(path, "%s/hwloc-cpuid-info", src_cpuiddump_path);
   file = fopen(path, "r");
   if (!file) {
     fprintf(stderr, "Couldn't open dumped cpuid summary %s\n", path);
-    free(path);
-    goto out_with_dir;
+    goto out_with_path;
   }
   if (!fgets(line, sizeof(line), file)) {
     fprintf(stderr, "Found read dumped cpuid summary in %s\n", path);
     fclose(file);
-    free(path);
-    goto out_with_dir;
+    goto out_with_path;
   }
   fclose(file);
   if (strcmp(line, "Architecture: x86\n")) {
     fprintf(stderr, "Found non-x86 dumped cpuid summary in %s: %s\n", path, line);
-    free(path);
-    goto out_with_dir;
+    goto out_with_path;
   }
   free(path);
 
@@ -1303,8 +1636,11 @@ hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t s
 
   return 0;
 
-out_with_dir:
+ out_with_path:
+  free(path);
+ out_with_dir:
   closedir(dir);
+#endif /* HWLOC_WIN_SYS & !__MINGW32__ needs a lot of work */
   return -1;
 }
 
@@ -1313,13 +1649,14 @@ hwloc_x86_backend_disable(struct hwloc_backend *backend)
 {
   struct hwloc_x86_backend_data_s *data = backend->private_data;
   hwloc_bitmap_free(data->apicid_set);
-  if (data->src_cpuiddump_path)
-    free(data->src_cpuiddump_path);
+  free(data->src_cpuiddump_path);
   free(data);
 }
 
 static struct hwloc_backend *
-hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
+hwloc_x86_component_instantiate(struct hwloc_topology *topology,
+				struct hwloc_disc_component *component,
+				unsigned excluded_phases __hwloc_attribute_unused,
 				const void *_data1 __hwloc_attribute_unused,
 				const void *_data2 __hwloc_attribute_unused,
 				const void *_data3 __hwloc_attribute_unused)
@@ -1328,7 +1665,7 @@ hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
   struct hwloc_x86_backend_data_s *data;
   const char *src_cpuiddump_path;
 
-  backend = hwloc_backend_alloc(component);
+  backend = hwloc_backend_alloc(topology, component);
   if (!backend)
     goto out;
 
@@ -1339,11 +1676,11 @@ hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
   }
 
   backend->private_data = data;
-  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
   backend->discover = hwloc_x86_discover;
   backend->disable = hwloc_x86_backend_disable;
 
   /* default values */
+  data->is_knl = 0;
   data->apicid_set = hwloc_bitmap_alloc();
   data->apicid_unique = 1;
   data->src_cpuiddump_path = NULL;
@@ -1354,6 +1691,7 @@ hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
     if (!hwloc_x86_check_cpuiddump_input(src_cpuiddump_path, set)) {
       backend->is_thissystem = 0;
       data->src_cpuiddump_path = strdup(src_cpuiddump_path);
+      assert(!hwloc_bitmap_iszero(set)); /* enforced by hwloc_x86_check_cpuiddump_input() */
       data->nbprocs = hwloc_bitmap_weight(set);
     } else {
       fprintf(stderr, "Ignoring dumped cpuid directory.\n");
@@ -1370,11 +1708,12 @@ hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
 }
 
 static struct hwloc_disc_component hwloc_x86_disc_component = {
-  HWLOC_DISC_COMPONENT_TYPE_CPU,
   "x86",
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  HWLOC_DISC_PHASE_CPU,
+  HWLOC_DISC_PHASE_GLOBAL,
   hwloc_x86_component_instantiate,
   45, /* between native and no_os */
+  1,
   NULL
 };
 
diff --git a/ext/hwloc/hwloc/topology-xml.c b/ext/hwloc/hwloc/topology-xml.c
new file mode 100644
index 000000000..f6bb210c9
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-xml.c
@@ -0,0 +1,3102 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2019 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2018 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/xml.h"
+#include "private/private.h"
+#include "private/misc.h"
+#include "private/debug.h"
+
+#include <math.h>
+
+int
+hwloc__xml_verbose(void)
+{
+  static int initialized = 0; /* nonzero once HWLOC_XML_VERBOSE has been looked up */
+  static int level = 0; /* cached verbosity, stays 0 when the variable is unset */
+  const char *setting;
+  if (initialized)
+    return level;
+  setting = getenv("HWLOC_XML_VERBOSE");
+  level = setting ? atoi(setting) : 0;
+  initialized = 1;
+  return level;
+}
+
+static int
+hwloc_nolibxml_import(void)
+{
+  /* the decision is computed on the first call and cached for later ones */
+  static int decided = 0;
+  static int skip_libxml = 0;
+  const char *setting;
+  if (decided)
+    return skip_libxml;
+  /* HWLOC_LIBXML takes precedence, HWLOC_LIBXML_IMPORT is only read when it is unset */
+  setting = getenv("HWLOC_LIBXML");
+  if (!setting)
+    setting = getenv("HWLOC_LIBXML_IMPORT");
+  if (setting)
+    skip_libxml = !atoi(setting);
+  decided = 1;
+  return skip_libxml;
+}
+
+static int
+hwloc_nolibxml_export(void)
+{
+  /* the decision is computed on the first call and cached for later ones */
+  static int decided = 0;
+  static int skip_libxml = 0;
+  const char *setting;
+  if (decided)
+    return skip_libxml;
+  /* HWLOC_LIBXML takes precedence, HWLOC_LIBXML_EXPORT is only read when it is unset */
+  setting = getenv("HWLOC_LIBXML");
+  if (!setting)
+    setting = getenv("HWLOC_LIBXML_EXPORT");
+  if (setting)
+    skip_libxml = !atoi(setting);
+  decided = 1;
+  return skip_libxml;
+}
+
+#define BASE64_ENCODED_LENGTH(length) (4*(((length)+2)/3))
+
+/*********************************
+ ********* XML callbacks *********
+ *********************************/
+
+/* set when registering nolibxml and libxml components.
+ * modifications protected by the components mutex.
+ * read by the common XML code in topology-xml.c to jump to the right XML backend.
+ */
+static struct hwloc_xml_callbacks *hwloc_nolibxml_callbacks = NULL, *hwloc_libxml_callbacks = NULL;
+
+void
+hwloc_xml_callbacks_register(struct hwloc_xml_component *comp)
+{ /* each global is only filled while it is still NULL, a table that is already set is kept */
+  if (NULL == hwloc_libxml_callbacks)
+    hwloc_libxml_callbacks = comp->libxml_callbacks;
+  if (NULL == hwloc_nolibxml_callbacks)
+    hwloc_nolibxml_callbacks = comp->nolibxml_callbacks;
+}
+
+void
+hwloc_xml_callbacks_reset(void)
+{
+  /* forget both registered callback tables so that a later registration can set them again */
+  hwloc_libxml_callbacks = hwloc_nolibxml_callbacks = NULL;
+}
+
+/************************************************
+ ********* XML import (common routines) *********
+ ************************************************/
+
+#define _HWLOC_OBJ_CACHE_OLD (HWLOC_OBJ_TYPE_MAX+1) /* temporarily used when importing pre-v2.0 attribute-less cache types */
+#define _HWLOC_OBJ_FUTURE    (HWLOC_OBJ_TYPE_MAX+2) /* temporarily used when ignoring future types */
+
+/* Apply one XML attribute (name=value) to the object being imported, or to the topology for root-only attributes.
+ * Attributes that are unknown, malformed or not applicable to obj->type are ignored (mostly reported only in XML verbose mode).
+ */
+static void
+hwloc__xml_import_object_attr(struct hwloc_topology *topology,
+			      struct hwloc_xml_backend_data_s *data,
+			      struct hwloc_obj *obj,
+			      const char *name, const char *value,
+			      hwloc__xml_import_state_t state)
+{
+  if (!strcmp(name, "type")) {
+    /* already handled */
+    return;
+  }
+  else if (!strcmp(name, "os_index"))
+    obj->os_index = strtoul(value, NULL, 10);
+  else if (!strcmp(name, "gp_index")) {
+    obj->gp_index = strtoull(value, NULL, 10);
+    if (!obj->gp_index && hwloc__xml_verbose())
+      fprintf(stderr, "%s: unexpected zero gp_index, topology may be invalid\n", state->global->msgprefix);
+    if (obj->gp_index >= topology->next_gp_index)
+      topology->next_gp_index = obj->gp_index + 1;
+  } else if (!strcmp(name, "cpuset")) {
+    if (!obj->cpuset)
+      obj->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->cpuset, value);
+  } else if (!strcmp(name, "complete_cpuset")) {
+    if (!obj->complete_cpuset)
+      obj->complete_cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->complete_cpuset, value);
+  } else if (!strcmp(name, "allowed_cpuset")) {
+    /* ignored except for root */
+    if (!obj->parent)
+      hwloc_bitmap_sscanf(topology->allowed_cpuset, value);
+  } else if (!strcmp(name, "nodeset")) {
+    if (!obj->nodeset)
+      obj->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->nodeset, value);
+  } else if (!strcmp(name, "complete_nodeset")) {
+    if (!obj->complete_nodeset)
+      obj->complete_nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_sscanf(obj->complete_nodeset, value);
+  } else if (!strcmp(name, "allowed_nodeset")) {
+    /* ignored except for root */
+    if (!obj->parent)
+      hwloc_bitmap_sscanf(topology->allowed_nodeset, value);
+  } else if (!strcmp(name, "name")) {
+    free(obj->name); /* free(NULL) is a no-op, a repeated attribute replaces the previous value */
+    obj->name = strdup(value);
+  } else if (!strcmp(name, "subtype")) {
+    free(obj->subtype); /* free(NULL) is a no-op, a repeated attribute replaces the previous value */
+    obj->subtype = strdup(value);
+  }
+
+  else if (!strcmp(name, "cache_size")) {
+    unsigned long long lvalue = strtoull(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD || obj->type == HWLOC_OBJ_MEMCACHE)
+      obj->attr->cache.size = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_size attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_linesize")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD || obj->type == HWLOC_OBJ_MEMCACHE)
+      obj->attr->cache.linesize = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_linesize attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_associativity")) {
+    int lvalue = atoi(value);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD || obj->type == HWLOC_OBJ_MEMCACHE)
+      obj->attr->cache.associativity = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_associativity attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "cache_type")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD || obj->type == HWLOC_OBJ_MEMCACHE) {
+      if (lvalue == HWLOC_OBJ_CACHE_UNIFIED
+	  || lvalue == HWLOC_OBJ_CACHE_DATA
+	  || lvalue == HWLOC_OBJ_CACHE_INSTRUCTION)
+	obj->attr->cache.type = (hwloc_obj_cache_type_t) lvalue;
+      else
+	fprintf(stderr, "%s: ignoring invalid cache_type attribute %lu\n",
+		state->global->msgprefix, lvalue);
+    } else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring cache_type attribute for non-cache object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "local_memory")) {
+    unsigned long long lvalue = strtoull(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      obj->attr->numanode.local_memory = lvalue;
+    else if (!obj->parent)
+      topology->machine_memory.local_memory = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring local_memory attribute for non-NUMAnode non-root object\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "depth")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+     if (hwloc__obj_type_is_cache(obj->type) || obj->type == _HWLOC_OBJ_CACHE_OLD || obj->type == HWLOC_OBJ_MEMCACHE) {
+	obj->attr->cache.depth = lvalue;
+     } else if (obj->type == HWLOC_OBJ_GROUP || obj->type == HWLOC_OBJ_BRIDGE) {
+       /* will be overwritten by the core */
+     } else if (hwloc__xml_verbose())
+       fprintf(stderr, "%s: ignoring depth attribute for object type without depth\n",
+	       state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "kind")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.kind = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring kind attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "subkind")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.subkind = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring subkind attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "dont_merge")) {
+    unsigned long lvalue = strtoul(value, NULL, 10);
+    if (obj->type == HWLOC_OBJ_GROUP)
+      obj->attr->group.dont_merge = lvalue;
+    else if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring dont_merge attribute for non-group object type\n",
+	      state->global->msgprefix);
+  }
+
+  else if (!strcmp(name, "pci_busid")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned domain, bus, dev, func;
+      if (sscanf(value, "%04x:%02x:%02x.%01x",
+		 &domain, &bus, &dev, &func) != 4) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid pci_busid format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->pcidev.domain = domain;
+	obj->attr->pcidev.bus = bus;
+	obj->attr->pcidev.dev = dev;
+	obj->attr->pcidev.func = func;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_busid attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "pci_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned classid, vendor, device, subvendor, subdevice, revision;
+      if (sscanf(value, "%04x [%04x:%04x] [%04x:%04x] %02x",
+		 &classid, &vendor, &device, &subvendor, &subdevice, &revision) != 6) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid pci_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->pcidev.class_id = classid;
+	obj->attr->pcidev.vendor_id = vendor;
+	obj->attr->pcidev.device_id = device;
+	obj->attr->pcidev.subvendor_id = subvendor;
+	obj->attr->pcidev.subdevice_id = subdevice;
+	obj->attr->pcidev.revision = revision;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_type attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "pci_link_speed")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_BRIDGE: {
+      obj->attr->pcidev.linkspeed = (float) atof(value);
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring pci_link_speed attribute for non-PCI object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "bridge_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned upstream_type, downstream_type;
+      if (sscanf(value, "%u-%u", &upstream_type, &downstream_type) != 2) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid bridge_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->bridge.upstream_type = (hwloc_obj_bridge_type_t) upstream_type;
+	obj->attr->bridge.downstream_type = (hwloc_obj_bridge_type_t) downstream_type;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring bridge_type attribute for non-bridge object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "bridge_pci")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE: {
+      unsigned domain, secbus, subbus;
+      if (sscanf(value, "%04x:[%02x-%02x]",
+		 &domain, &secbus, &subbus) != 3) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid bridge_pci format string %s\n",
+		  state->global->msgprefix, value);
+      } else {
+	obj->attr->bridge.downstream.pci.domain = domain;
+	obj->attr->bridge.downstream.pci.secondary_bus = secbus;
+	obj->attr->bridge.downstream.pci.subordinate_bus = subbus;
+      }
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring bridge_pci attribute for non-bridge object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (!strcmp(name, "osdev_type")) {
+    switch (obj->type) {
+    case HWLOC_OBJ_OS_DEVICE: {
+      unsigned osdev_type;
+      if (sscanf(value, "%u", &osdev_type) != 1) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring invalid osdev_type format string %s\n",
+		  state->global->msgprefix, value);
+      } else
+	obj->attr->osdev.type = (hwloc_obj_osdev_type_t) osdev_type;
+      break;
+    }
+    default:
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring osdev_type attribute for non-osdev object\n",
+		state->global->msgprefix);
+      break;
+    }
+  }
+
+  else if (data->version_major < 2) {
+    /************************
+     * deprecated from 1.x
+     */
+    if (!strcmp(name, "os_level")
+	|| !strcmp(name, "online_cpuset"))
+      { /* ignored */ }
+
+    /*************************
+     * deprecated from 1.0
+     */
+    else if (!strcmp(name, "dmi_board_vendor")) {
+      if (value[0])
+	hwloc_obj_add_info(obj, "DMIBoardVendor", value);
+    }
+    else if (!strcmp(name, "dmi_board_name")) {
+      if (value[0])
+	hwloc_obj_add_info(obj, "DMIBoardName", value);
+    }
+
+    else if (data->version_major < 1) {
+      /*************************
+       * deprecated from 0.9
+       */
+      if (!strcmp(name, "memory_kB")) {
+	unsigned long long lvalue = strtoull(value, NULL, 10);
+	if (obj->type == _HWLOC_OBJ_CACHE_OLD)
+	  obj->attr->cache.size = lvalue << 10;
+	else if (obj->type == HWLOC_OBJ_NUMANODE)
+	  obj->attr->numanode.local_memory = lvalue << 10;
+	else if (!obj->parent)
+	  topology->machine_memory.local_memory = lvalue << 10;
+	else if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring memory_kB attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+      }
+      else if (!strcmp(name, "huge_page_size_kB")) {
+	unsigned long lvalue = strtoul(value, NULL, 10);
+	if (obj->type == HWLOC_OBJ_NUMANODE || !obj->parent) {
+	  struct hwloc_numanode_attr_s *memory = obj->type == HWLOC_OBJ_NUMANODE ? &obj->attr->numanode : &topology->machine_memory;
+	  /* zero-allocate so that the count of the new entry is defined; drop the attribute if allocation fails */
+	  if (!memory->page_types && (memory->page_types = calloc(1, sizeof(*memory->page_types))) != NULL)
+	    memory->page_types_len = 1;
+	  if (memory->page_types)
+	    memory->page_types[0].size = lvalue << 10;
+	} else if (hwloc__xml_verbose()) {
+	  fprintf(stderr, "%s: ignoring huge_page_size_kB attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+	}
+      }
+      else if (!strcmp(name, "huge_page_free")) {
+	unsigned long lvalue = strtoul(value, NULL, 10);
+	if (obj->type == HWLOC_OBJ_NUMANODE || !obj->parent) {
+	  struct hwloc_numanode_attr_s *memory = obj->type == HWLOC_OBJ_NUMANODE ? &obj->attr->numanode : &topology->machine_memory;
+	  /* zero-allocate so that the size of the new entry is defined; drop the attribute if allocation fails */
+	  if (!memory->page_types && (memory->page_types = calloc(1, sizeof(*memory->page_types))) != NULL)
+	    memory->page_types_len = 1;
+	  if (memory->page_types)
+	    memory->page_types[0].count = lvalue;
+	} else if (hwloc__xml_verbose()) {
+	  fprintf(stderr, "%s: ignoring huge_page_free attribute for non-NUMAnode non-root object\n",
+		  state->global->msgprefix);
+	}
+      }
+      /* end of deprecated from 0.9 */
+      else goto unknown;
+    }
+    /* end of deprecated from 1.0 */
+    else goto unknown;
+  }
+  else {
+  unknown:
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring unknown object attribute %s\n",
+	      state->global->msgprefix, name);
+  }
+}
+
+
+static int
+hwloc__xml_import_info(struct hwloc_xml_backend_data_s *data,
+		       hwloc_obj_t obj,
+		       hwloc__xml_import_state_t state)
+{
+  char *key = NULL, *val = NULL;
+  char *attrname, *attrvalue;
+  int is_v1_subtype;
+
+  /* collect the "name" and "value" attributes, any other attribute is an error */
+  while (state->global->next_attr(state, &attrname, &attrvalue) >= 0) {
+    if (!strcmp(attrname, "name")) {
+      key = attrvalue;
+    } else if (!strcmp(attrname, "value")) {
+      val = attrvalue;
+    } else {
+      return -1;
+    }
+  }
+
+  /* an info lacking either its name or its value is dropped
+   * (empty strings are ignored by libxml)
+   */
+  if (key && val) {
+    /* 1.x stored subtype in Type or CoProcType */
+    is_v1_subtype = data->version_major < 2
+      && (!strcmp(key, "Type") || !strcmp(key, "CoProcType"));
+    if (is_v1_subtype) {
+      if (obj->subtype)
+	free(obj->subtype);
+      obj->subtype = strdup(val);
+    } else {
+      hwloc_obj_add_info(obj, key, val);
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+static int
+hwloc__xml_import_pagetype(hwloc_topology_t topology __hwloc_attribute_unused, struct hwloc_numanode_attr_s *memory,
+			   hwloc__xml_import_state_t state)
+{
+  struct hwloc_memory_page_type_s *grown;
+  char *attrname, *attrvalue;
+  uint64_t pagesize = 0, pagecount = 0;
+  unsigned slot = memory->page_types_len; /* index of the entry that may be appended */
+
+  /* only the "size" and "count" attributes are accepted */
+  while (state->global->next_attr(state, &attrname, &attrvalue) >= 0) {
+    if (!strcmp(attrname, "size"))
+      pagesize = strtoull(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "count"))
+      pagecount = strtoull(attrvalue, NULL, 10);
+    else
+      return -1;
+  }
+
+  /* a page_type without a non-zero size is skipped, so is one whose slot cannot be allocated */
+  if (pagesize) {
+    grown = realloc(memory->page_types, (slot+1)*sizeof(*memory->page_types));
+    if (grown) {
+      grown[slot].size = pagesize;
+      grown[slot].count = pagecount;
+      memory->page_types = grown;
+      memory->page_types_len = slot+1;
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/* Import a v1.x <distances> element of obj: nbobjs*nbobjs <latency> children scaled by latency_base.
+ * The matrix is queued in data only when obj is the root. Returns -1 on invalid input or allocation failure.
+ */
+static int
+hwloc__xml_v1import_distances(struct hwloc_xml_backend_data_s *data,
+			      hwloc_obj_t obj,
+			      hwloc__xml_import_state_t state)
+{
+  unsigned long reldepth = 0, nbobjs = 0;
+  float latbase = 0;
+  float *matrix = NULL;
+  struct hwloc__xml_imported_v1distances_s *v1dist = NULL;
+  char *tag;
+  int ret;
+
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "nbobjs"))
+      nbobjs = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "relative_depth"))
+      reldepth = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "latency_base"))
+      latbase = (float) atof(attrvalue);
+    else
+      return -1;
+  }
+
+  if (nbobjs && reldepth && latbase) {
+    unsigned i;
+
+    /* nbobjs comes from the XML: only allocate if nbobjs*nbobjs fits the unsigned loop index and
+     * its byte size fits size_t, a wrapped-around size would give a matrix smaller than what is written.
+     */
+    if (nbobjs <= ((unsigned) -1) / nbobjs && nbobjs*nbobjs <= ((size_t) -1) / sizeof(float))
+      matrix = malloc(nbobjs*nbobjs*sizeof(float));
+    v1dist = malloc(sizeof(*v1dist));
+    if (!matrix || !v1dist) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: failed to allocate v1distance matrix for %lu objects\n",
+		state->global->msgprefix, nbobjs);
+      goto error_with_arrays;
+    }
+
+    v1dist->kind = HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY;
+    /* TODO: we can't know for sure if it comes from the OS.
+     * On Linux/x86, it would be 10 on the diagonal.
+     * On Solaris/T5, 15 on the diagonal.
+     * Just check whether all values are integers, and that all values on the diagonal are minimal and identical?
+     */
+
+    v1dist->nbobjs = nbobjs;
+    v1dist->floats = matrix;
+
+    for(i=0; i<nbobjs*nbobjs; i++) {
+      struct hwloc__xml_import_state_s childstate;
+      char *attrname, *attrvalue;
+      float val;
+
+      ret = state->global->find_child(state, &childstate, &tag);
+      if (ret <= 0 || strcmp(tag, "latency"))
+	goto error_with_arrays; /* a latency child is needed */
+
+      ret = state->global->next_attr(&childstate, &attrname, &attrvalue);
+      if (ret < 0 || strcmp(attrname, "value"))
+	goto error_with_arrays;
+
+      val = (float) atof((char *) attrvalue);
+      matrix[i] = val * latbase;
+
+      ret = state->global->close_tag(&childstate);
+      if (ret < 0)
+	goto error_with_arrays;
+
+      state->global->close_child(&childstate);
+    }
+
+    if (nbobjs < 2) {
+      /* distances with a single object are useless, even if the XML isn't invalid */
+      assert(nbobjs == 1);
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring invalid distance matrix with only 1 object\n",
+		state->global->msgprefix);
+      free(matrix);
+      free(v1dist);
+
+    } else if (obj->parent) {
+      /* we currently only import distances attached to root.
+       * we can't save obj in v1dist because obj could be dropped during insert if ignored.
+       * we could save its complete_cpu/nodeset instead to find it back later.
+       * but it doesn't matter much since only NUMA distances attached to root matter.
+       */
+      free(matrix);
+      free(v1dist);
+
+    } else {
+      /* queue the distance for real */
+      v1dist->prev = data->last_v1dist;
+      v1dist->next = NULL;
+      if (data->last_v1dist)
+	data->last_v1dist->next = v1dist;
+      else
+	data->first_v1dist = v1dist;
+      data->last_v1dist = v1dist;
+    }
+  }
+
+  return state->global->close_tag(state);
+
+ error_with_arrays:
+  free(matrix);
+  free(v1dist);
+  return -1;
+}
+
+/* Import a <userdata> element of obj and pass its content to topology->userdata_import_cb:
+ * decoded from base64 if needed, as stored in the XML when userdata_not_decoded is set, or only consumed without callback.
+ */
+static int
+hwloc__xml_import_userdata(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj,
+			   hwloc__xml_import_state_t state)
+{
+  char *attrname, *attrvalue;
+  char *label = NULL; /* optional "name" attribute */
+  char *content;
+  size_t length = 0; /* "length" attribute: size of the decoded data */
+  size_t rawlength; /* expected length of the content as stored in the XML */
+  int is_base64 = 0;
+  int ret;
+
+  while (state->global->next_attr(state, &attrname, &attrvalue) >= 0) {
+    if (!strcmp(attrname, "length")) {
+      length = strtoul(attrvalue, NULL, 10);
+    } else if (!strcmp(attrname, "encoding")) {
+      is_base64 = !strcmp(attrvalue, "base64");
+    } else if (!strcmp(attrname, "name")) {
+      label = attrvalue;
+    } else {
+      return -1;
+    }
+  }
+  rawlength = is_base64 ? BASE64_ENCODED_LENGTH(length) : length;
+
+  if (!topology->userdata_import_cb) {
+    /* no callback registered, only consume the content */
+    if (state->global->get_content(state, &content, rawlength) < 0)
+      return -1;
+
+  } else if (topology->userdata_not_decoded) {
+    /* pass the content as stored, the encoding and the optional name are packed into a fake name */
+    char *fakename;
+    if (state->global->get_content(state, &content, rawlength) < 0)
+      return -1;
+    fakename = malloc(6 + 1 + (label ? strlen(label) : 4) + 1);
+    if (!fakename)
+      return -1;
+    sprintf(fakename, is_base64 ? "base64%c%s" : "normal%c%s", label ? ':' : '-', label ? label : "anon");
+    topology->userdata_import_cb(topology, obj, fakename, content, length);
+    free(fakename);
+
+  } else if (is_base64 && length) {
+    /* decode into a temporary buffer before calling back */
+    char *decoded;
+    ret = state->global->get_content(state, &content, rawlength);
+    if (ret < 0)
+      return -1;
+    if (ret) {
+      decoded = malloc(length+1);
+      if (!decoded)
+	return -1;
+      assert(content[rawlength] == 0);
+      ret = hwloc_decode_from_base64(content, decoded, length+1);
+      if (ret != (int) length) {
+	free(decoded);
+	return -1;
+      }
+      topology->userdata_import_cb(topology, obj, label, decoded, length);
+      free(decoded);
+    }
+
+  } else {
+    /* always handle length==0 in the non-encoded case */
+    content = (char *) "";
+    if (length && state->global->get_content(state, &content, length) < 0)
+      return -1;
+    topology->userdata_import_cb(topology, obj, label, content, length);
+  }
+
+  state->global->close_content(state);
+  return state->global->close_tag(state);
+}
+
+/* Report on stderr an out-of-order XML topology load: object new was inserted after object old.
+ * The message shows the type and cpusets of both objects and the hwloc versions/process names involved.
+ */
+static void hwloc__xml_import_report_outoforder(hwloc_topology_t topology, hwloc_obj_t new, hwloc_obj_t old)
+{
+  char *progname = hwloc_progname(topology);
+  const char *origversion = hwloc_obj_get_info_by_name(topology->levels[0][0], "hwlocVersion");
+  const char *origprogname = hwloc_obj_get_info_by_name(topology->levels[0][0], "ProcessName");
+  char *c1 = NULL, *cc1 = NULL, t1[64]; /* NULL so that a string that could not be built is shown as "none" and safely freed */
+  char *c2 = NULL, *cc2 = NULL, t2[64];
+
+  hwloc_bitmap_asprintf(&c1, new->cpuset);
+  hwloc_bitmap_asprintf(&cc1, new->complete_cpuset);
+  hwloc_obj_type_snprintf(t1, sizeof(t1), new, 0);
+  if (old->cpuset)
+    hwloc_bitmap_asprintf(&c2, old->cpuset);
+  if (old->complete_cpuset)
+    hwloc_bitmap_asprintf(&cc2, old->complete_cpuset);
+  hwloc_obj_type_snprintf(t2, sizeof(t2), old, 0);
+
+  fprintf(stderr, "****************************************************************************\n");
+  fprintf(stderr, "* hwloc has encountered an out-of-order XML topology load.\n");
+  fprintf(stderr, "* Object %s cpuset %s complete %s\n",
+	  t1, c1 ? c1 : "none", cc1 ? cc1 : "none");
+  fprintf(stderr, "* was inserted after object %s with %s and %s.\n",
+	  t2, c2 ? c2 : "none", cc2 ? cc2 : "none");
+  fprintf(stderr, "* The error occurred in hwloc %s inside process `%s', while\n",
+	  HWLOC_VERSION, progname ? progname : "<unknown>");
+  if (origversion || origprogname)
+    fprintf(stderr, "* the input XML was generated by hwloc %s inside process `%s'.\n",
+	    origversion ? origversion : "(unknown version)", origprogname ? origprogname : "<unknown>");
+  else
+    fprintf(stderr, "* the input XML was generated by an unspecified ancient hwloc release.\n");
+  fprintf(stderr, "* Please check that your input topology XML file is valid.\n");
+  fprintf(stderr, "* Set HWLOC_DEBUG_CHECK=1 in the environment to detect further issues.\n");
+  fprintf(stderr, "****************************************************************************\n");
+
+  free(c1);
+  free(cc1);
+  free(c2);
+  free(cc2);
+  free(progname);
+}
+
+/*
+ * Import one XML <object> element (and, recursively, its children) into obj.
+ *
+ * topology/data: the topology being built and the XML backend state.
+ * parent: the object that will receive obj, or NULL when obj is the root
+ *   (the root is already inserted in the topology).
+ * obj: the pre-allocated object to fill from the XML attributes and children.
+ * gotignored: set to 1 when obj is dropped instead of being inserted.
+ * state: the XML parser state positioned on this object.
+ *
+ * Steps: read the attributes ("type" must come first), read the non-object
+ * children (page_type, info, v1 distances, userdata), validate the object
+ * against its parent, insert it, then import child objects and make sure
+ * children are ordered.
+ *
+ * Returns the result of close_tag() on success, -1 on error.
+ * When failing before obj got inserted, a non-root obj is freed here.
+ */
+static int
+hwloc__xml_import_object(hwloc_topology_t topology,
+			 struct hwloc_xml_backend_data_s *data,
+			 hwloc_obj_t parent, hwloc_obj_t obj, int *gotignored,
+			 hwloc__xml_import_state_t state)
+{
+  int ignored = 0;
+  int childrengotignored = 0;
+  int attribute_less_cache = 0;
+  int numa_was_root = 0;
+  char *tag;
+  struct hwloc__xml_import_state_s childstate;
+
+  /* set parent now since it's used during import below or in subfunctions */
+  obj->parent = parent;
+
+  /* process attributes */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "type")) {
+      if (hwloc_type_sscanf(attrvalue, &obj->type, NULL, 0) < 0) {
+	if (!strcasecmp(attrvalue, "Cache")) {
+	  obj->type = _HWLOC_OBJ_CACHE_OLD; /* will be fixed below */
+	  attribute_less_cache = 1;
+	} else if (!strcasecmp(attrvalue, "System")) {
+	  if (!parent)
+	    obj->type = HWLOC_OBJ_MACHINE;
+	  else {
+	    if (hwloc__xml_verbose())
+	      fprintf(stderr, "%s: obsolete System object only allowed at root\n",
+		      state->global->msgprefix);
+	    goto error_with_object;
+	  }
+	} else if (!strcasecmp(attrvalue, "Tile")) {
+	  /* deal with possible future type */
+	  obj->type = HWLOC_OBJ_GROUP;
+	  obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_TILE;
+	} else if (!strcasecmp(attrvalue, "Module")) {
+	  /* deal with possible future type */
+	  obj->type = HWLOC_OBJ_GROUP;
+	  obj->attr->group.kind = HWLOC_GROUP_KIND_INTEL_MODULE;
+	} else if (!strcasecmp(attrvalue, "MemCache")) {
+	  /* ignore possible future type */
+	  obj->type = _HWLOC_OBJ_FUTURE;
+	  ignored = 1;
+	  if (hwloc__xml_verbose())
+	    fprintf(stderr, "%s: %s object not-supported, will be ignored\n",
+		    state->global->msgprefix, attrvalue);
+	} else {
+	  if (hwloc__xml_verbose())
+	    fprintf(stderr, "%s: unrecognized object type string %s\n",
+		    state->global->msgprefix, attrvalue);
+	  goto error_with_object;
+	}
+      }
+    } else {
+      /* type needed first */
+      if (obj->type == HWLOC_OBJ_TYPE_NONE) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: object attribute %s found before type\n",
+		  state->global->msgprefix,  attrname);
+	goto error_with_object;
+      }
+      hwloc__xml_import_object_attr(topology, data, obj, attrname, attrvalue, state);
+    }
+  }
+
+  /* process non-object subnodes to get info attrs (as well as page_types, etc) */
+  while (1) {
+    int ret;
+
+    tag = NULL;
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret < 0)
+      /* obj is not inserted yet, free it (if non-root) instead of leaking it */
+      goto error_with_object;
+    if (!ret)
+      break;
+
+    if (!strcmp(tag, "object")) {
+      /* we'll handle children later */
+      break;
+
+    } else if (!strcmp(tag, "page_type")) {
+      if (obj->type == HWLOC_OBJ_NUMANODE) {
+	ret = hwloc__xml_import_pagetype(topology, &obj->attr->numanode, &childstate);
+      } else if (!parent) {
+	ret = hwloc__xml_import_pagetype(topology, &topology->machine_memory, &childstate);
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid non-NUMAnode object child %s\n",
+		  state->global->msgprefix, tag);
+	ret = -1;
+      }
+
+    } else if (!strcmp(tag, "info")) {
+      ret = hwloc__xml_import_info(data, obj, &childstate);
+    } else if (data->version_major < 2 && !strcmp(tag, "distances")) {
+      ret = hwloc__xml_v1import_distances(data, obj, &childstate);
+    } else if (!strcmp(tag, "userdata")) {
+      ret = hwloc__xml_import_userdata(topology, obj, &childstate);
+    } else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid special object child %s\n",
+		state->global->msgprefix, tag);
+      ret = -1;
+    }
+
+    if (ret < 0)
+      /* obj is not inserted yet, free it (if non-root) instead of leaking it */
+      goto error_with_object;
+
+    state->global->close_child(&childstate);
+  }
+
+  if (parent && obj->type == HWLOC_OBJ_MACHINE) {
+    /* replace non-root Machine with Groups */
+    obj->type = HWLOC_OBJ_GROUP;
+  }
+
+  if (parent && data->version_major >= 2) {
+    /* check parent/child types for 2.x */
+    if (hwloc__obj_type_is_normal(obj->type)) {
+      if (!hwloc__obj_type_is_normal(parent->type)) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "normal object %s cannot be child of non-normal parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_memory(obj->type)) {
+      if (hwloc__obj_type_is_io(parent->type) || HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "Memory object %s cannot be child of non-normal-or-memory parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_io(obj->type)) {
+      if (hwloc__obj_type_is_memory(parent->type) || HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "I/O object %s cannot be child of non-normal-or-I/O parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    }
+
+  } else if (parent && data->version_major < 2) {
+    /* check parent/child types for pre-v2.0 */
+    if (hwloc__obj_type_is_normal(obj->type) || HWLOC_OBJ_NUMANODE == obj->type) {
+      if (hwloc__obj_type_is_special(parent->type)) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "v1.x normal v1.x object %s cannot be child of special parent %s\n",
+		  hwloc_obj_type_string(obj->type), hwloc_obj_type_string(parent->type));
+	goto error_with_object;
+      }
+    } else if (hwloc__obj_type_is_io(obj->type)) {
+      if (HWLOC_OBJ_MISC == parent->type) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "I/O object %s cannot be child of Misc parent\n",
+		  hwloc_obj_type_string(obj->type));
+	goto error_with_object;
+      }
+    }
+  }
+
+  if (data->version_major < 2) {
+    /***************************
+     * 1.x specific checks
+     */
+
+    /* attach pre-v2.0 children of NUMA nodes to normal parent */
+    if (parent && parent->type == HWLOC_OBJ_NUMANODE) {
+      parent = parent->parent;
+      assert(parent);
+    }
+
+    /* insert a group above pre-v2.0 NUMA nodes if needed */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      if (!parent) {
+	/* crazy case of NUMA node root (only possible when filtering Machine keep_structure in v1.x),
+	 * reinsert a Machine object
+	 */
+	hwloc_obj_t machine = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MACHINE, HWLOC_UNKNOWN_INDEX);
+	if (!machine)
+	  /* out of memory. obj is the root here (no parent), nothing is freed below */
+	  goto error_with_object;
+	machine->cpuset = hwloc_bitmap_dup(obj->cpuset);
+	machine->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	machine->nodeset = hwloc_bitmap_dup(obj->nodeset);
+	machine->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+	topology->levels[0][0] = machine;
+	parent = machine;
+	numa_was_root = 1;
+
+      } else if (!hwloc_bitmap_isequal(obj->complete_cpuset, parent->complete_cpuset)) {
+	/* This NUMA node has a different locality from its parent.
+	 * Don't attach it to this parent, or it will get its parent cpusets.
+	 * Add an intermediate Group with the desired locality.
+	 */
+	int needgroup = 1;
+	hwloc_obj_t sibling;
+
+	sibling = parent->memory_first_child;
+	if (sibling && !sibling->subtype
+	    && !sibling->next_sibling
+	    && obj->subtype && !strcmp(obj->subtype, "MCDRAM")
+	    && hwloc_bitmap_iszero(obj->complete_cpuset)) {
+	  /* this is KNL MCDRAM, we want to attach it near its DDR sibling */
+	  needgroup = 0;
+	}
+	/* Ideally we would also detect similar cases on future non-KNL platforms with multiple local NUMA nodes.
+	 * That's unlikely to occur with v1.x.
+	 * And we have no way to be sure if this CPU-less node is desired or not.
+	 */
+
+	if (needgroup
+	    && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
+	  hwloc_obj_t group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+	  if (!group)
+	    /* out of memory, obj is not inserted yet */
+	    goto error_with_object;
+	  group->gp_index = 0; /* will be initialized at the end of the discovery once we know the max */
+	  group->cpuset = hwloc_bitmap_dup(obj->cpuset);
+	  group->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	  group->nodeset = hwloc_bitmap_dup(obj->nodeset);
+	  group->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+	  group->attr->group.kind = HWLOC_GROUP_KIND_MEMORY;
+	  hwloc_insert_object_by_parent(topology, parent, group);
+	  parent = group;
+	}
+      }
+    }
+
+    /* fixup attribute-less caches imported from pre-v2.0 XMLs */
+    if (attribute_less_cache) {
+      assert(obj->type == _HWLOC_OBJ_CACHE_OLD);
+      obj->type = hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type);
+    }
+
+    /* fixup Misc objects inserted by cpusets in pre-v2.0 XMLs */
+    if (obj->type == HWLOC_OBJ_MISC && obj->cpuset)
+      obj->type = HWLOC_OBJ_GROUP;
+
+    /* check set consistency.
+     * 1.7.2 and earlier reported I/O Groups with only a cpuset, we don't want to reject those XMLs yet.
+     * Ignore those Groups since fixing the missing sets is hard (would need to look at children sets which are not available yet).
+     * Just abort the XML for non-Groups.
+     */
+    if (!obj->cpuset != !obj->complete_cpuset) {
+      /* has some cpuset without others */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with some missing cpusets\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    } else if (!obj->nodeset != !obj->complete_nodeset) {
+      /* has some nodeset without others */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with some missing nodesets\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    } else if (obj->nodeset && !obj->cpuset) {
+      /* has nodesets without cpusets (the contrary is allowed in pre-2.0) */
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	ignored = 1;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: invalid object %s P#%u with either cpuset or nodeset missing\n",
+		  state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+	goto error_with_object;
+      }
+    }
+    /* end of 1.x specific checks */
+  }
+
+  /* 2.0 backward compatibility */
+  if (obj->type == HWLOC_OBJ_GROUP) {
+    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE
+	|| (obj->subtype && !strcmp(obj->subtype, "Die")))
+      obj->type = HWLOC_OBJ_DIE;
+  }
+
+  /* check that cache attributes are coherent with the actual type */
+  if (hwloc__obj_type_is_cache(obj->type)
+      && obj->type != hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid cache type %s with attribute depth %u and type %d\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->attr->cache.depth, (int) obj->attr->cache.type);
+    goto error_with_object;
+  }
+
+  /* check special types vs cpuset */
+  if (!obj->cpuset && !hwloc__obj_type_is_special(obj->type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid normal object %s P#%u without cpuset\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+  if (obj->cpuset && hwloc__obj_type_is_special(obj->type)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid special object %s with cpuset\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type));
+    goto error_with_object;
+  }
+
+  /* check parent vs child sets */
+  if (obj->cpuset && parent && !parent->cpuset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid object %s P#%u with cpuset while parent has none\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+  if (obj->nodeset && parent && !parent->nodeset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid object %s P#%u with nodeset while parent has none\n",
+	      state->global->msgprefix, hwloc_obj_type_string(obj->type), obj->os_index);
+    goto error_with_object;
+  }
+
+  /* check NUMA nodes */
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    if (!obj->nodeset) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid NUMA node object P#%u without nodeset\n",
+		state->global->msgprefix, obj->os_index);
+      goto error_with_object;
+    }
+    /* append to the list of NUMA nodes in XML order (linked through the cousin pointers) */
+    data->nbnumanodes++;
+    obj->prev_cousin = data->last_numanode;
+    obj->next_cousin = NULL;
+    if (data->last_numanode)
+      data->last_numanode->next_cousin = obj;
+    else
+      data->first_numanode = obj;
+    data->last_numanode = obj;
+  }
+
+  if (!hwloc_filter_check_keep_object(topology, obj)) {
+    /* Ignore this object instead of inserting it.
+     *
+     * Well, let the core ignore the root object later
+     * because we don't know yet if root has more than one child.
+     */
+    if (parent)
+      ignored = 1;
+  }
+
+  if (parent && !ignored) {
+    /* root->parent is NULL, and root is already inserted */
+    hwloc_insert_object_by_parent(topology, parent, obj);
+    /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
+  }
+
+  /* From now on obj may be linked in the topology,
+   * hence the error paths below must not free it (they use the error label).
+   */
+
+  /* process object subnodes, if we found one in the above loop */
+  while (tag) {
+    int ret;
+
+    if (!strcmp(tag, "object")) {
+      hwloc_obj_t childobj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_TYPE_MAX, HWLOC_UNKNOWN_INDEX);
+      if (!childobj) {
+	/* out of memory */
+	ret = -1;
+      } else {
+	/* children of an ignored object are attached to its parent instead */
+	childobj->parent = ignored ? parent : obj;
+	ret = hwloc__xml_import_object(topology, data, ignored ? parent : obj, childobj,
+				       &childrengotignored,
+				       &childstate);
+      }
+    } else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid special object child %s while looking for objects\n",
+		state->global->msgprefix, tag);
+      ret = -1;
+    }
+
+    if (ret < 0)
+      goto error;
+
+    state->global->close_child(&childstate);
+
+    tag = NULL;
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret < 0)
+      goto error;
+    if (!ret)
+      break;
+  }
+
+  if (numa_was_root) {
+    /* duplicate NUMA infos to root, most of them are likely root-specific */
+    unsigned i;
+    for(i=0; i<obj->infos_count; i++) {
+      struct hwloc_info_s *info = &obj->infos[i];
+      hwloc_obj_add_info(parent, info->name, info->value);
+    }
+    /* TODO some infos are root-only (hwlocVersion, ProcessName, etc), remove them from obj? */
+  }
+
+  if (ignored) {
+    /* drop that object, and tell the parent that one child got ignored */
+    hwloc_free_unlinked_object(obj);
+    *gotignored = 1;
+
+  } else if (obj->first_child) {
+    /* now that all children are inserted, make sure they are in-order,
+     * so that the core doesn't have to deal with crappy children list.
+     */
+    hwloc_obj_t cur, next;
+    for(cur = obj->first_child, next = cur->next_sibling;
+	next;
+	cur = next, next = next->next_sibling) {
+      /* If reordering is needed, at least one pair of consecutive children will be out-of-order.
+       * So just check pairs of consecutive children.
+       *
+       * We checked above that complete_cpuset is always set.
+       */
+      if (hwloc_bitmap_compare_first(next->complete_cpuset, cur->complete_cpuset) < 0) {
+	/* next should be before cur */
+	if (!childrengotignored) {
+	  /* only report once per process; ignored children may legitimately change the order */
+	  static int reported = 0;
+	  if (!reported && !hwloc_hide_errors()) {
+	    hwloc__xml_import_report_outoforder(topology, next, cur);
+	    reported = 1;
+	  }
+	}
+	hwloc__reorder_children(obj);
+	break;
+      }
+    }
+    /* no need to reorder memory children as long as there are no intermediate memory objects
+     * that could cause reordering when filtered-out.
+     */
+  }
+
+  return state->global->close_tag(state);
+
+ error_with_object:
+  if (parent)
+    /* root->parent is NULL, and root is already inserted. the caller will cleanup that root. */
+    hwloc_free_unlinked_object(obj);
+ error:
+  return -1;
+}
+
+/*
+ * Import one v2 distances element: <distances2> when heterotypes is 0,
+ * <distances2hetero> when heterotypes is non-zero (each index is then
+ * prefixed by its object type and a colon).
+ *
+ * The element must provide the nbobjs, indexing and kind attributes
+ * (and type when not heterogeneous), followed by <indexes> and <u64values>
+ * children whose contents are space-separated numbers.
+ * Exactly nbobjs indexes and nbobjs*nbobjs values are required.
+ *
+ * Returns the result of close_tag() when the distances were either added
+ * to the topology or deliberately ignored, -1 on error.
+ */
+static int
+hwloc__xml_v2import_distances(hwloc_topology_t topology,
+			      hwloc__xml_import_state_t state,
+			      int heterotypes)
+{
+  hwloc_obj_type_t unique_type = HWLOC_OBJ_TYPE_NONE;
+  hwloc_obj_type_t *different_types = NULL;
+  unsigned nbobjs = 0;
+  int indexing = heterotypes;
+  int os_indexing = 0;
+  int gp_indexing = heterotypes;
+  char *name = NULL;
+  unsigned long kind = 0;
+  unsigned nr_indexes, nr_u64values;
+  uint64_t *indexes;
+  uint64_t *u64values;
+  int ret;
+
+  /* name of the tag being imported, for error messages */
+#define _TAG_NAME (heterotypes ? "distances2hetero" : "distances2")
+
+  /* process attributes */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "nbobjs"))
+      nbobjs = strtoul(attrvalue, NULL, 10);
+    else if (!strcmp(attrname, "type")) {
+      if (hwloc_type_sscanf(attrvalue, &unique_type, NULL, 0) < 0) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: unrecognized %s type %s\n",
+		  state->global->msgprefix, _TAG_NAME, attrvalue);
+	goto out;
+      }
+    }
+    else if (!strcmp(attrname, "indexing")) {
+      indexing = 1;
+      if (!strcmp(attrvalue, "os"))
+	os_indexing = 1;
+      else if (!strcmp(attrvalue, "gp"))
+	gp_indexing = 1;
+    }
+    else if (!strcmp(attrname, "kind")) {
+      kind = strtoul(attrvalue, NULL, 10);
+    }
+    else if (!strcmp(attrname, "name")) {
+      name = attrvalue;
+    }
+    else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring unknown %s attribute %s\n",
+		state->global->msgprefix, _TAG_NAME, attrname);
+    }
+  }
+
+  /* abort if missing attribute */
+  if (!nbobjs || (!heterotypes && unique_type == HWLOC_OBJ_TYPE_NONE) || !indexing || !kind) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: %s missing some attributes\n",
+	      state->global->msgprefix, _TAG_NAME);
+    goto out;
+  }
+
+  indexes = malloc(nbobjs*sizeof(*indexes));
+  u64values = malloc(nbobjs*nbobjs*sizeof(*u64values));
+  if (heterotypes)
+    different_types = malloc(nbobjs*sizeof(*different_types));
+  if (!indexes || !u64values || (heterotypes && !different_types)) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: failed to allocate %s arrays for %u objects\n",
+	      state->global->msgprefix, _TAG_NAME, nbobjs);
+    goto out_with_arrays;
+  }
+
+  /* process children */
+  nr_indexes = 0;
+  nr_u64values = 0;
+  while (1) {
+    struct hwloc__xml_import_state_s childstate;
+    char *attrname, *attrvalue, *tag, *buffer;
+    int length;
+    int is_index = 0;
+    int is_u64values = 0;
+
+    /* a failure to find a child ends the loop too, the counts are verified after the loop */
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret <= 0)
+      break;
+
+    if (!strcmp(tag, "indexes"))
+      is_index = 1;
+    else if (!strcmp(tag, "u64values"))
+      is_u64values = 1;
+    if (!is_index && !is_u64values) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: %s with unrecognized child %s\n",
+		state->global->msgprefix, _TAG_NAME, tag);
+      goto out_with_arrays;
+    }
+
+    if (state->global->next_attr(&childstate, &attrname, &attrvalue) < 0
+	|| strcmp(attrname, "length")) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: %s child must have length attribute\n",
+		state->global->msgprefix, _TAG_NAME);
+      goto out_with_arrays;
+    }
+    length = atoi(attrvalue);
+
+    ret = state->global->get_content(&childstate, &buffer, length);
+    if (ret < 0) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: %s child needs content of length %d\n",
+		state->global->msgprefix, _TAG_NAME, length);
+      goto out_with_arrays;
+    }
+
+    if (is_index) {
+      /* get indexes */
+      char *tmp, *tmp2;
+      if (nr_indexes >= nbobjs) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: %s with more than %u indexes\n",
+		  state->global->msgprefix, _TAG_NAME, nbobjs);
+	goto out_with_arrays;
+      }
+      tmp = buffer;
+      while (1) {
+	char *next;
+	unsigned long long u;
+	if (heterotypes) {
+	  /* each heterogeneous index is written as "<type>:<index>" */
+	  hwloc_obj_type_t t = HWLOC_OBJ_TYPE_NONE;
+	  if (hwloc_type_sscanf(tmp, &t, NULL, 0) < 0) {
+	    if (hwloc__xml_verbose())
+	      fprintf(stderr, "%s: %s with unrecognized heterogeneous type %s\n",
+		      state->global->msgprefix, _TAG_NAME, tmp);
+	    goto out_with_arrays;
+	  }
+	  tmp2 = strchr(tmp, ':');
+	  if (!tmp2) {
+	    if (hwloc__xml_verbose())
+	      fprintf(stderr, "%s: %s with missing colon after heterogeneous type %s\n",
+		      state->global->msgprefix, _TAG_NAME, tmp);
+	    goto out_with_arrays;
+	  }
+	  tmp = tmp2+1;
+	  different_types[nr_indexes] = t;
+	}
+	u = strtoull(tmp, &next, 0);
+	if (next == tmp)
+	  break;
+	indexes[nr_indexes++] = u;
+	if (*next != ' ')
+	  break;
+	/* stop before overflowing the array, even if more values follow */
+	if (nr_indexes == nbobjs)
+	  break;
+	tmp = next+1;
+      }
+
+    } else if (is_u64values) {
+      /* get uint64_t values */
+      char *tmp;
+      if (nr_u64values >= nbobjs*nbobjs) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: %s with more than %u u64values\n",
+		  state->global->msgprefix, _TAG_NAME, nbobjs*nbobjs);
+	goto out_with_arrays;
+      }
+      tmp = buffer;
+      while (1) {
+	char *next;
+	unsigned long long u = strtoull(tmp, &next, 0);
+	if (next == tmp)
+	  break;
+	u64values[nr_u64values++] = u;
+	if (*next != ' ')
+	  break;
+	/* stop before overflowing the array, even if more values follow */
+	if (nr_u64values == nbobjs*nbobjs)
+	  break;
+	tmp = next+1;
+      }
+    }
+
+    state->global->close_content(&childstate);
+
+    ret = state->global->close_tag(&childstate);
+    if (ret < 0) {
+      /* this failure is about the closing tag of either kind of child, not about the number of indexes */
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: %s child %s without valid closing tag\n",
+		state->global->msgprefix, _TAG_NAME, is_index ? "indexes" : "u64values");
+      goto out_with_arrays;
+    }
+
+    state->global->close_child(&childstate);
+  }
+
+  if (nr_indexes != nbobjs) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: %s with less than %u indexes\n",
+	      state->global->msgprefix, _TAG_NAME, nbobjs);
+    goto out_with_arrays;
+  }
+  if (nr_u64values != nbobjs*nbobjs) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: %s with less than %u u64values\n",
+	      state->global->msgprefix, _TAG_NAME, nbobjs*nbobjs);
+    goto out_with_arrays;
+  }
+
+  if (nbobjs < 2) {
+    /* distances with a single object are useless, even if the XML isn't invalid */
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: ignoring %s with only %u objects\n",
+	      state->global->msgprefix, _TAG_NAME, nbobjs);
+    goto out_ignore;
+  }
+  if (unique_type == HWLOC_OBJ_PU || unique_type == HWLOC_OBJ_NUMANODE) {
+    if (!os_indexing) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring PU or NUMA %s without os_indexing\n",
+		state->global->msgprefix, _TAG_NAME);
+      goto out_ignore;
+    }
+  } else {
+    if (!gp_indexing) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring !PU or !NUMA %s without gp_indexing\n",
+		state->global->msgprefix, _TAG_NAME);
+      goto out_ignore;
+    }
+  }
+
+  hwloc_internal_distances_add_by_index(topology, name, unique_type, different_types, nbobjs, indexes, u64values, kind, 0);
+
+  /* prevent freeing below */
+  indexes = NULL;
+  u64values = NULL;
+  different_types = NULL;
+
+ out_ignore:
+  free(different_types);
+  free(indexes);
+  free(u64values);
+  return state->global->close_tag(state);
+
+ out_with_arrays:
+  free(different_types);
+  free(indexes);
+  free(u64values);
+ out:
+  return -1;
+#undef _TAG_NAME
+}
+
+/*
+ * Import one <diff> element and append it to the list of diffs.
+ *
+ * state: the XML parser state positioned on the <diff> element.
+ * firstdiffp/lastdiffp: head and tail of the list being built,
+ *   both are updated when a diff is appended.
+ *
+ * Only object attribute diffs (HWLOC_TOPOLOGY_DIFF_OBJ_ATTR) are imported.
+ * A diff of another type, or with missing mandatory attributes, is skipped
+ * without failing.
+ *
+ * Returns the result of close_tag() on success,
+ * -1 on unknown attribute or allocation failure.
+ */
+static int
+hwloc__xml_import_diff_one(hwloc__xml_import_state_t state,
+			   hwloc_topology_diff_t *firstdiffp,
+			   hwloc_topology_diff_t *lastdiffp)
+{
+  char *type_s = NULL;
+  char *obj_depth_s = NULL;
+  char *obj_index_s = NULL;
+  char *obj_attr_type_s = NULL;
+/* char *obj_attr_index_s = NULL; unused for now */
+  char *obj_attr_name_s = NULL;
+  char *obj_attr_oldvalue_s = NULL;
+  char *obj_attr_newvalue_s = NULL;
+
+  /* collect all attributes first, they are interpreted below once the diff type is known */
+  while (1) {
+    char *attrname, *attrvalue;
+    if (state->global->next_attr(state, &attrname, &attrvalue) < 0)
+      break;
+    if (!strcmp(attrname, "type"))
+      type_s = attrvalue;
+    else if (!strcmp(attrname, "obj_depth"))
+      obj_depth_s = attrvalue;
+    else if (!strcmp(attrname, "obj_index"))
+      obj_index_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_type"))
+      obj_attr_type_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_index"))
+      { /* obj_attr_index_s = attrvalue; unused for now */ }
+    else if (!strcmp(attrname, "obj_attr_name"))
+      obj_attr_name_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_oldvalue"))
+      obj_attr_oldvalue_s = attrvalue;
+    else if (!strcmp(attrname, "obj_attr_newvalue"))
+      obj_attr_newvalue_s = attrvalue;
+    else {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: ignoring unknown diff attribute %s\n",
+		state->global->msgprefix, attrname);
+      return -1;
+    }
+  }
+
+  if (type_s) {
+    switch (atoi(type_s)) {
+    default:
+      break;
+    case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+      /* object attribute diff */
+      hwloc_topology_diff_obj_attr_type_t obj_attr_type;
+      hwloc_topology_diff_t diff;
+
+      /* obj_attr mandatory generic attributes */
+      if (!obj_depth_s || !obj_index_s || !obj_attr_type_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr generic attributes\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* obj_attr mandatory attributes common to all subtypes */
+      if (!obj_attr_oldvalue_s || !obj_attr_newvalue_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr value attributes\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* mandatory attributes for obj_attr_info subtype */
+      obj_attr_type = atoi(obj_attr_type_s);
+      if (obj_attr_type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO && !obj_attr_name_s) {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: missing mandatory obj attr info name attribute\n",
+		  state->global->msgprefix);
+	break;
+      }
+
+      /* now we know we have everything we need */
+      diff = malloc(sizeof(*diff));
+      if (!diff)
+	return -1;
+      diff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+      diff->obj_attr.obj_depth = atoi(obj_depth_s);
+      diff->obj_attr.obj_index = atoi(obj_index_s);
+      /* clear the union so that unused string pointers are NULL */
+      memset(&diff->obj_attr.diff, 0, sizeof(diff->obj_attr.diff));
+      diff->obj_attr.diff.generic.type = obj_attr_type;
+
+      switch (obj_attr_type) {
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE:
+	diff->obj_attr.diff.uint64.oldvalue = strtoull(obj_attr_oldvalue_s, NULL, 0);
+	diff->obj_attr.diff.uint64.newvalue = strtoull(obj_attr_newvalue_s, NULL, 0);
+	break;
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+	diff->obj_attr.diff.string.name = strdup(obj_attr_name_s);
+	/* FALLTHRU */
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+	diff->obj_attr.diff.string.oldvalue = strdup(obj_attr_oldvalue_s);
+	diff->obj_attr.diff.string.newvalue = strdup(obj_attr_newvalue_s);
+	if ((obj_attr_type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO && !diff->obj_attr.diff.string.name)
+	    || !diff->obj_attr.diff.string.oldvalue
+	    || !diff->obj_attr.diff.string.newvalue) {
+	  /* out of memory, don't queue a diff with missing strings */
+	  free(diff->obj_attr.diff.string.name);
+	  free(diff->obj_attr.diff.string.oldvalue);
+	  free(diff->obj_attr.diff.string.newvalue);
+	  free(diff);
+	  return -1;
+	}
+	break;
+      default:
+	/* unknown attribute type, the diff is queued with a zeroed payload */
+	break;
+      }
+
+      /* append to the list */
+      if (*firstdiffp)
+	(*lastdiffp)->generic.next = diff;
+      else
+	*firstdiffp = diff;
+      *lastdiffp = diff;
+      diff->generic.next = NULL;
+    }
+    }
+  }
+
+  return state->global->close_tag(state);
+}
+
+/*
+ * Import all <diff> children of the current XML element.
+ *
+ * state: the XML parser state positioned on the parent of the <diff> elements.
+ * firstdiffp: receives the head of the imported list,
+ *   or NULL when nothing was imported or on error.
+ *
+ * Returns 0 on success, a negative value on error
+ * (unexpected child tag, or failure to import one diff).
+ * On error, the diffs that were already imported are destroyed.
+ */
+int
+hwloc__xml_import_diff(hwloc__xml_import_state_t state,
+		       hwloc_topology_diff_t *firstdiffp)
+{
+  hwloc_topology_diff_t firstdiff = NULL, lastdiff = NULL;
+  int ret;
+
+  *firstdiffp = NULL;
+
+  while (1) {
+    struct hwloc__xml_import_state_s childstate;
+    char *tag;
+
+    ret = state->global->find_child(state, &childstate, &tag);
+    if (ret < 0) {
+      ret = -1;
+      goto failed;
+    }
+    if (!ret)
+      break;
+
+    if (!strcmp(tag, "diff")) {
+      ret = hwloc__xml_import_diff_one(&childstate, &firstdiff, &lastdiff);
+    } else
+      ret = -1;
+
+    if (ret < 0)
+      goto failed;
+
+    state->global->close_child(&childstate);
+  }
+
+  *firstdiffp = firstdiff;
+  return 0;
+
+ failed:
+  /* the caller only gets NULL, release the partial list instead of leaking it */
+  hwloc_topology_diff_destroy(firstdiff);
+  return ret;
+}
+
+/***********************************
+ ********* main XML import *********
+ ***********************************/
+
+/*
+ * Convert a v1 distance matrix of nbobjs*nbobjs floats into uint64_t values.
+ *
+ * If HWLOC_XML_V1DIST_SCALE is set in the environment, all values are
+ * multiplied by that scale. Otherwise, if all values are non-negative and
+ * (almost) integers, they are rounded to the nearest integer. Otherwise they
+ * are multiplied by the default scale 1000.
+ *
+ * When a scale is applied, it is saved in the root object info attribute
+ * "xmlv1DistancesScale".
+ */
+static void
+hwloc_convert_from_v1dist_floats(hwloc_topology_t topology, unsigned nbobjs, float *floats, uint64_t *u64s)
+{
+  unsigned i;
+  int is_uint;
+  char *env;
+  float scale = 1000.f;
+  /* large enough for any float printed with %f (up to 39 integer digits, sign, point and 6 decimals) */
+  char scalestring[64];
+
+  env = getenv("HWLOC_XML_V1DIST_SCALE");
+  if (env) {
+    scale = (float) atof(env);
+    goto scale;
+  }
+
+  is_uint = 1;
+  /* find out if all values are integers */
+  for(i=0; i<nbobjs*nbobjs; i++) {
+    float f, iptr, fptr;
+    f = floats[i];
+    if (f < 0.f) {
+      is_uint = 0;
+      break;
+    }
+    fptr = modff(f, &iptr);
+    if (fptr > .001f && fptr < .999f) {
+      is_uint = 0;
+      break;
+    }
+    /* f is non-negative here. round to nearest without going through int, which would overflow for large values */
+    u64s[i] = (uint64_t)(f+.5f);
+  }
+  if (is_uint)
+    return;
+
+ scale:
+  /* TODO heuristic to find a good scale */
+  /* overwrites whatever the loop above may have already stored */
+  for(i=0; i<nbobjs*nbobjs; i++)
+    u64s[i] = (uint64_t)(scale * floats[i]);
+
+  /* save the scale in root info attrs.
+   * Not perfect since we may have multiple of them,
+   * and some distances might disappear in case of restrict, etc.
+   */
+  /* the scale may come from the environment, never write past the buffer */
+  snprintf(scalestring, sizeof(scalestring), "%f", scale);
+  hwloc_obj_add_info(hwloc_get_root_obj(topology), "xmlv1DistancesScale", scalestring);
+}
+
+/* this canNOT be the first XML call */
+static int
+hwloc_look_xml(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
+{
+  /*
+   * This backend enforces !topology->is_thissystem by default.
+   */
+
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_xml_backend_data_s *data = backend->private_data;
+  struct hwloc__xml_import_state_s state, childstate;
+  struct hwloc_obj *root = topology->levels[0][0];
+  char *tag;
+  int gotignored = 0;
+  hwloc_localeswitch_declare;
+  int ret;
+
+  assert(dstatus->phase == HWLOC_DISC_PHASE_GLOBAL);
+
+  state.global = data;
+
+  assert(!root->cpuset);
+
+  hwloc_localeswitch_init();
+
+  data->nbnumanodes = 0;
+  data->first_numanode = data->last_numanode = NULL;
+  data->first_v1dist = data->last_v1dist = NULL;
+
+  ret = data->look_init(data, &state);
+  if (ret < 0)
+    goto failed;
+
+  if (data->version_major > 2) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: cannot import XML version %u.%u > 2\n",
+	      data->msgprefix, data->version_major, data->version_minor);
+    goto err;
+  }
+
+  /* find root object tag and import it */
+  ret = state.global->find_child(&state, &childstate, &tag);
+  if (ret < 0 || !ret || strcmp(tag, "object"))
+    goto failed;
+  ret = hwloc__xml_import_object(topology, data, NULL /*  no parent */, root,
+				 &gotignored,
+				 &childstate);
+  if (ret < 0)
+    goto failed;
+  state.global->close_child(&childstate);
+  assert(!gotignored);
+
+  /* the root may have changed if we had to reinsert a Machine */
+  root = topology->levels[0][0];
+
+  if (data->version_major >= 2) {
+    /* find v2 distances */
+    while (1) {
+      ret = state.global->find_child(&state, &childstate, &tag);
+      if (ret < 0)
+	goto failed;
+      if (!ret)
+	break;
+      if (!strcmp(tag, "distances2")) {
+	ret = hwloc__xml_v2import_distances(topology, &childstate, 0);
+	if (ret < 0)
+	  goto failed;
+      } else if (!strcmp(tag, "distances2hetero")) {
+	ret = hwloc__xml_v2import_distances(topology, &childstate, 1);
+	if (ret < 0)
+	  goto failed;
+      } else {
+	if (hwloc__xml_verbose())
+	  fprintf(stderr, "%s: ignoring unknown tag `%s' after root object, expected `distances2'\n",
+		  data->msgprefix, tag);
+	goto done;
+      }
+      state.global->close_child(&childstate);
+    }
+  }
+
+  /* find end of topology tag */
+  state.global->close_tag(&state);
+
+done:
+  if (!root->cpuset) {
+    if (hwloc__xml_verbose())
+      fprintf(stderr, "%s: invalid root object without cpuset\n",
+	      data->msgprefix);
+    goto err;
+  }
+
+  /* update pre-v2.0 memory group gp_index */
+  if (data->version_major < 2 && data->first_numanode) {
+    hwloc_obj_t node = data->first_numanode;
+    do {
+      if (node->parent->type == HWLOC_OBJ_GROUP
+	  && !node->parent->gp_index)
+	node->parent->gp_index = topology->next_gp_index++;
+      node = node->next_cousin;
+    } while (node);
+  }
+
+  if (data->version_major < 2 && data->first_v1dist) {
+    /* handle v1 distances */
+    struct hwloc__xml_imported_v1distances_s *v1dist, *v1next = data->first_v1dist;
+    while ((v1dist = v1next) != NULL) {
+      unsigned nbobjs = v1dist->nbobjs;
+      v1next = v1dist->next;
+      /* Handle distances as NUMA node distances if nbobjs matches.
+       * Otherwise drop, only NUMA distances really matter.
+       *
+       * We could also attach to a random level with the right nbobjs,
+       * but it would require to have those objects in the original XML order (like the first_numanode cousin-list).
+       * because the topology order can be different if some parents are ignored during load.
+       */
+      if (nbobjs == data->nbnumanodes) {
+	hwloc_obj_t *objs = malloc(nbobjs*sizeof(hwloc_obj_t));
+	uint64_t *values = malloc(nbobjs*nbobjs*sizeof(*values));
+	if (objs && values) {
+	  hwloc_obj_t node;
+	  unsigned i;
+	  for(i=0, node = data->first_numanode;
+	      i<nbobjs;
+	      i++, node = node->next_cousin)
+	    objs[i] = node;
+	  hwloc_convert_from_v1dist_floats(topology, nbobjs, v1dist->floats, values);
+	  hwloc_internal_distances_add(topology, NULL, nbobjs, objs, values, v1dist->kind, 0);
+	} else {
+	  free(objs);
+	  free(values);
+	}
+      }
+      free(v1dist->floats);
+      free(v1dist);
+    }
+    data->first_v1dist = data->last_v1dist = NULL;
+  }
+
+  /* FIXME:
+   * We should check that the existing object sets are consistent:
+   * no intersection between objects of a same level,
+   * object sets included in parent sets.
+   * hwloc never generated such buggy XML, but users could create one.
+   *
+   * We want to add these checks to the existing core code that
+   * adds missing sets and propagates parent/children sets
+   * (in case another backend ever generates buggy object sets as well).
+   */
+
+  if (data->version_major >= 2) {
+    /* v2 must have non-empty nodesets since at least one NUMA node is required */
+    if (!root->nodeset) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid root object without nodeset\n",
+		data->msgprefix);
+      goto err;
+    }
+    if (hwloc_bitmap_iszero(root->nodeset)) {
+      if (hwloc__xml_verbose())
+	fprintf(stderr, "%s: invalid root object with empty nodeset\n",
+		data->msgprefix);
+      goto err;
+    }
+  } else {
+    /* if v1 without nodeset, the core will add a default NUMA node and nodesets */
+  }
+
+  /* allocate default cpusets and nodesets if missing, the core will restrict them */
+  hwloc_alloc_root_sets(root);
+
+  /* keep the "Backend" information intact */
+  /* we could add "BackendSource=XML" to notify that XML was used between the actual backend and here */
+
+  topology->support.discovery->pu = 1;
+  topology->support.discovery->disallowed_pu = 1;
+  if (data->nbnumanodes) {
+    topology->support.discovery->numa = 1;
+    topology->support.discovery->numa_memory = 1; // FIXME
+    topology->support.discovery->disallowed_numa = 1;
+  }
+
+  if (data->look_done)
+    data->look_done(data, 0);
+
+  hwloc_localeswitch_fini();
+  return 0;
+
+ failed:
+  if (data->look_done)
+    data->look_done(data, -1);
+  if (hwloc__xml_verbose())
+    fprintf(stderr, "%s: XML component discovery failed.\n",
+	    data->msgprefix);
+ err:
+  hwloc_free_object_siblings_and_children(root->first_child);
+  root->first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->memory_first_child);
+  root->memory_first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->io_first_child);
+  root->io_first_child = NULL;
+  hwloc_free_object_siblings_and_children(root->misc_first_child);
+  root->misc_first_child = NULL;
+
+  /* make sure the core will abort */
+  if (root->cpuset)
+    hwloc_bitmap_zero(root->cpuset);
+  if (root->nodeset)
+    hwloc_bitmap_zero(root->nodeset);
+
+  hwloc_localeswitch_fini();
+  return -1;
+}
+
+/* Load a topology diff from the XML file at xmlpath.
+ *
+ * This can be the first XML call, hence components are initialized here
+ * and finalized again before returning.
+ * *firstdiffp is reset to NULL before the import callback gets a chance to fill it,
+ * refnamep is handed to the callback untouched,
+ * and the value returned by the callback is returned as is.
+ */
+int
+hwloc_topology_diff_load_xml(const char *xmlpath,
+			     hwloc_topology_diff_t *firstdiffp, char **refnamep)
+{
+  struct hwloc__xml_import_state_s state;
+  struct hwloc_xml_backend_data_s fakedata; /* only for storing global info during parsing */
+  hwloc_localeswitch_declare;
+  const char *lastslash;
+  int force_nolibxml;
+  int ret;
+
+  state.global = &fakedata;
+
+  /* messages are prefixed with the file name without its directories */
+  lastslash = strrchr(xmlpath, '/');
+  fakedata.msgprefix = strdup(lastslash ? lastslash+1 : xmlpath);
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  *firstdiffp = NULL;
+
+  force_nolibxml = hwloc_nolibxml_import();
+  for(;;) {
+    if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml)) {
+      ret = hwloc_nolibxml_callbacks->import_diff(&state, xmlpath, NULL, 0, firstdiffp, refnamep);
+      break;
+    }
+
+    ret = hwloc_libxml_callbacks->import_diff(&state, xmlpath, NULL, 0, firstdiffp, refnamep);
+    if (ret >= 0 || errno != ENOSYS)
+      break;
+
+    /* libxml failed with ENOSYS, forget it so that the next iteration uses the nolibxml callbacks */
+    hwloc_libxml_callbacks = NULL;
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  free(fakedata.msgprefix);
+  return ret;
+}
+
+/* Load a topology diff from the XML memory buffer xmlbuffer of size buflen.
+ *
+ * This can be the first XML call, hence components are initialized here
+ * and finalized again before returning.
+ * *firstdiffp is reset to NULL before the import callback gets a chance to fill it,
+ * refnamep is handed to the callback untouched,
+ * and the value returned by the callback is returned as is.
+ */
+int
+hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen,
+				   hwloc_topology_diff_t *firstdiffp, char **refnamep)
+{
+  struct hwloc__xml_import_state_s state;
+  struct hwloc_xml_backend_data_s fakedata; /* only for storing global info during parsing */
+  hwloc_localeswitch_declare;
+  int force_nolibxml;
+  int ret;
+
+  /* there's no file name here, use a fixed prefix for messages */
+  fakedata.msgprefix = strdup("xmldiffbuffer");
+  state.global = &fakedata;
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  *firstdiffp = NULL;
+
+  force_nolibxml = hwloc_nolibxml_import();
+  for(;;) {
+    if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml)) {
+      ret = hwloc_nolibxml_callbacks->import_diff(&state, NULL, xmlbuffer, buflen, firstdiffp, refnamep);
+      break;
+    }
+
+    ret = hwloc_libxml_callbacks->import_diff(&state, NULL, xmlbuffer, buflen, firstdiffp, refnamep);
+    if (ret >= 0 || errno != ENOSYS)
+      break;
+
+    /* libxml failed with ENOSYS, forget it so that the next iteration uses the nolibxml callbacks */
+    hwloc_libxml_callbacks = NULL;
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini();
+  free(fakedata.msgprefix);
+  return ret;
+}
+
+/************************************************
+ ********* XML export (common routines) *********
+ ************************************************/
+
+/* Characters that are exported as is: ASCII 32 to 126, tab, newline and carriage return. */
+#define HWLOC_XML_CHAR_VALID(c) (((c) >= 32 && (c) <= 126) || (c) == '\t' || (c) == '\n' || (c) == '\r')
+
+/* Check that the first length bytes of buf are all accepted by HWLOC_XML_CHAR_VALID().
+ * Returns 0 if so (including when length is 0), -1 as soon as one byte is rejected.
+ */
+static int
+hwloc__xml_export_check_buffer(const char *buf, size_t length)
+{
+  size_t i; /* same type as length, a narrower index would wrap before reaching a huge length */
+  for(i=0; i<length; i++)
+    if (!HWLOC_XML_CHAR_VALID(buf[i]))
+      return -1;
+  return 0;
+}
+
+/* Duplicate old while dropping the characters that HWLOC_XML_CHAR_VALID() rejects.
+ * Returns a malloc'ed string, or NULL if the allocation failed.
+ */
+static char*
+hwloc__xml_export_safestrdup(const char *old)
+{
+  size_t kept = 0;
+  size_t i;
+  char *copy;
+
+  /* the filtered copy cannot be longer than the original */
+  copy = malloc(strlen(old)+1);
+  if (!copy)
+    return NULL;
+
+  for(i=0; old[i] != '\0'; i++) {
+    if (!HWLOC_XML_CHAR_VALID(old[i]))
+      continue;
+    copy[kept++] = old[i];
+  }
+  copy[kept] = '\0';
+  return copy;
+}
+
+/* Export the attributes of obj as properties of the XML element `state',
+ * which the caller opened and will close.
+ * Some child elements are added as well: "page_type" for NUMA nodes, "info",
+ * and "distances" when exporting the root object as v1.
+ * The userdata export callback is finally invoked if both obj->userdata
+ * and the callback are set.
+ *
+ * When flags contains HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1, the v1 variant is produced.
+ * The children objects of obj are not exported here.
+ */
+static void
+hwloc__xml_export_object_contents (hwloc__xml_export_state_t state, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  char *setstring = NULL, *setstring2 = NULL;
+  char tmp[255]; /* scratch buffer for formatting property values */
+  int v1export = flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1;
+  unsigned i,j;
+
+  /* some types are exported under another name in v1 */
+  if (v1export && obj->type == HWLOC_OBJ_PACKAGE)
+    state->new_prop(state, "type", "Socket");
+  else if (v1export && obj->type == HWLOC_OBJ_DIE)
+    state->new_prop(state, "type", "Group");
+  else if (v1export && hwloc__obj_type_is_cache(obj->type))
+    state->new_prop(state, "type", "Cache");
+  else
+    state->new_prop(state, "type", hwloc_obj_type_string(obj->type));
+
+  if (obj->os_index != HWLOC_UNKNOWN_INDEX) {
+    sprintf(tmp, "%u", obj->os_index);
+    state->new_prop(state, "os_index", tmp);
+  }
+
+  if (obj->cpuset) {
+    int empty_cpusets = 0;
+
+    if (v1export && obj->type == HWLOC_OBJ_NUMANODE) {
+      /* walk up this memory hierarchy to find-out if we are the first numa node.
+       * v1 non-first NUMA nodes have empty cpusets.
+       */
+      hwloc_obj_t parent = obj;
+      while (!hwloc_obj_type_is_normal(parent->type)) {
+	if (parent->sibling_rank > 0) {
+	  empty_cpusets = 1;
+	  break;
+	}
+	parent = parent->parent;
+      }
+    }
+
+    if (empty_cpusets) {
+      state->new_prop(state, "cpuset", "0x0");
+      state->new_prop(state, "online_cpuset", "0x0");
+      state->new_prop(state, "complete_cpuset", "0x0");
+      state->new_prop(state, "allowed_cpuset", "0x0");
+
+    } else {
+      /* normal case */
+      hwloc_bitmap_asprintf(&setstring, obj->cpuset);
+      state->new_prop(state, "cpuset", setstring);
+
+      hwloc_bitmap_asprintf(&setstring2, obj->complete_cpuset);
+      state->new_prop(state, "complete_cpuset", setstring2);
+      free(setstring2);
+
+      /* v1 online_cpuset gets the same string as cpuset */
+      if (v1export)
+	state->new_prop(state, "online_cpuset", setstring);
+      free(setstring);
+
+      /* allowed sets are exported in every v1 object, but only in the root otherwise */
+      if (v1export || !obj->parent) {
+	hwloc_bitmap_t allowed_cpuset = hwloc_bitmap_dup(obj->cpuset);
+	hwloc_bitmap_and(allowed_cpuset, allowed_cpuset, topology->allowed_cpuset);
+	hwloc_bitmap_asprintf(&setstring, allowed_cpuset);
+	state->new_prop(state, "allowed_cpuset", setstring);
+	free(setstring);
+	hwloc_bitmap_free(allowed_cpuset);
+      }
+    }
+
+    /* If exporting v1, we should clear second local NUMA bits from nodeset,
+     * but the importer will clear them anyway.
+     */
+    hwloc_bitmap_asprintf(&setstring, obj->nodeset);
+    state->new_prop(state, "nodeset", setstring);
+    free(setstring);
+
+    hwloc_bitmap_asprintf(&setstring, obj->complete_nodeset);
+    state->new_prop(state, "complete_nodeset", setstring);
+    free(setstring);
+
+    if (v1export || !obj->parent) {
+      hwloc_bitmap_t allowed_nodeset = hwloc_bitmap_dup(obj->nodeset);
+      hwloc_bitmap_and(allowed_nodeset, allowed_nodeset, topology->allowed_nodeset);
+      hwloc_bitmap_asprintf(&setstring, allowed_nodeset);
+      state->new_prop(state, "allowed_nodeset", setstring);
+      free(setstring);
+      hwloc_bitmap_free(allowed_nodeset);
+    }
+  }
+
+  if (!v1export) {
+    sprintf(tmp, "%llu", (unsigned long long) obj->gp_index);
+    state->new_prop(state, "gp_index", tmp);
+  }
+
+  /* name and subtype are filtered through safestrdup before being exported */
+  if (obj->name) {
+    char *name = hwloc__xml_export_safestrdup(obj->name);
+    if (name) {
+      state->new_prop(state, "name", name);
+      free(name);
+    }
+  }
+  if (!v1export && obj->subtype) {
+    char *subtype = hwloc__xml_export_safestrdup(obj->subtype);
+    if (subtype) {
+      state->new_prop(state, "subtype", subtype);
+      free(subtype);
+    }
+  }
+
+  /* type-specific attributes */
+  switch (obj->type) {
+  case HWLOC_OBJ_NUMANODE:
+    if (obj->attr->numanode.local_memory) {
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.local_memory);
+      state->new_prop(state, "local_memory", tmp);
+    }
+    for(i=0; i<obj->attr->numanode.page_types_len; i++) {
+      struct hwloc__xml_export_state_s childstate;
+      state->new_child(state, &childstate, "page_type");
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.page_types[i].size);
+      childstate.new_prop(&childstate, "size", tmp);
+      sprintf(tmp, "%llu", (unsigned long long) obj->attr->numanode.page_types[i].count);
+      childstate.new_prop(&childstate, "count", tmp);
+      childstate.end_object(&childstate, "page_type");
+    }
+    break;
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+  case HWLOC_OBJ_MEMCACHE:
+    sprintf(tmp, "%llu", (unsigned long long) obj->attr->cache.size);
+    state->new_prop(state, "cache_size", tmp);
+    sprintf(tmp, "%u", obj->attr->cache.depth);
+    state->new_prop(state, "depth", tmp);
+    sprintf(tmp, "%u", (unsigned) obj->attr->cache.linesize);
+    state->new_prop(state, "cache_linesize", tmp);
+    sprintf(tmp, "%d", obj->attr->cache.associativity);
+    state->new_prop(state, "cache_associativity", tmp);
+    sprintf(tmp, "%d", (int) obj->attr->cache.type);
+    state->new_prop(state, "cache_type", tmp);
+    break;
+  case HWLOC_OBJ_GROUP:
+    if (v1export) {
+      sprintf(tmp, "%u", obj->attr->group.depth);
+      state->new_prop(state, "depth", tmp);
+      if (obj->attr->group.dont_merge)
+        state->new_prop(state, "dont_merge", "1");
+    } else {
+      sprintf(tmp, "%u", obj->attr->group.kind);
+      state->new_prop(state, "kind", tmp);
+      sprintf(tmp, "%u", obj->attr->group.subkind);
+      state->new_prop(state, "subkind", tmp);
+      if (obj->attr->group.dont_merge)
+        state->new_prop(state, "dont_merge", "1");
+    }
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    sprintf(tmp, "%d-%d", (int) obj->attr->bridge.upstream_type, (int) obj->attr->bridge.downstream_type);
+    state->new_prop(state, "bridge_type", tmp);
+    sprintf(tmp, "%u", obj->attr->bridge.depth);
+    state->new_prop(state, "depth", tmp);
+    if (obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+      sprintf(tmp, "%04x:[%02x-%02x]",
+	      (unsigned) obj->attr->bridge.downstream.pci.domain,
+	      (unsigned) obj->attr->bridge.downstream.pci.secondary_bus,
+	      (unsigned) obj->attr->bridge.downstream.pci.subordinate_bus);
+      state->new_prop(state, "bridge_pci", tmp);
+    }
+    /* bridges with a PCI upstream also get the PCI device properties below */
+    if (obj->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI)
+      break;
+    /* FALLTHRU */
+  case HWLOC_OBJ_PCI_DEVICE:
+    sprintf(tmp, "%04x:%02x:%02x.%01x",
+	    (unsigned) obj->attr->pcidev.domain,
+	    (unsigned) obj->attr->pcidev.bus,
+	    (unsigned) obj->attr->pcidev.dev,
+	    (unsigned) obj->attr->pcidev.func);
+    state->new_prop(state, "pci_busid", tmp);
+    sprintf(tmp, "%04x [%04x:%04x] [%04x:%04x] %02x",
+	    (unsigned) obj->attr->pcidev.class_id,
+	    (unsigned) obj->attr->pcidev.vendor_id, (unsigned) obj->attr->pcidev.device_id,
+	    (unsigned) obj->attr->pcidev.subvendor_id, (unsigned) obj->attr->pcidev.subdevice_id,
+	    (unsigned) obj->attr->pcidev.revision);
+    state->new_prop(state, "pci_type", tmp);
+    sprintf(tmp, "%f", obj->attr->pcidev.linkspeed);
+    state->new_prop(state, "pci_link_speed", tmp);
+    break;
+  case HWLOC_OBJ_OS_DEVICE:
+    sprintf(tmp, "%d", (int) obj->attr->osdev.type);
+    state->new_prop(state, "osdev_type", tmp);
+    break;
+  default:
+    break;
+  }
+
+  /* info attributes are exported as "info" children, and skipped if a copy could not be allocated */
+  for(i=0; i<obj->infos_count; i++) {
+    char *name = hwloc__xml_export_safestrdup(obj->infos[i].name);
+    char *value = hwloc__xml_export_safestrdup(obj->infos[i].value);
+    if (name && value) {
+      struct hwloc__xml_export_state_s childstate;
+      state->new_child(state, &childstate, "info");
+      childstate.new_prop(&childstate, "name", name);
+      childstate.new_prop(&childstate, "value", value);
+      childstate.end_object(&childstate, "info");
+    }
+    free(name);
+    free(value);
+  }
+  /* the subtype is exported as an info attribute in v1 */
+  if (v1export && obj->subtype) {
+    char *subtype = hwloc__xml_export_safestrdup(obj->subtype);
+    if (subtype) {
+      struct hwloc__xml_export_state_s childstate;
+      int is_coproctype = (obj->type == HWLOC_OBJ_OS_DEVICE && obj->attr->osdev.type == HWLOC_OBJ_OSDEV_COPROC);
+      state->new_child(state, &childstate, "info");
+      childstate.new_prop(&childstate, "name", is_coproctype ? "CoProcType" : "Type");
+      childstate.new_prop(&childstate, "value", subtype);
+      childstate.end_object(&childstate, "info");
+      free(subtype);
+    }
+  }
+  /* v1 Dies were exported as Group above, mark them with a Type=Die info attribute */
+  if (v1export && obj->type == HWLOC_OBJ_DIE) {
+    struct hwloc__xml_export_state_s childstate;
+    state->new_child(state, &childstate, "info");
+    childstate.new_prop(&childstate, "name", "Type");
+    childstate.new_prop(&childstate, "value", "Die");
+    childstate.end_object(&childstate, "info");
+  }
+
+  if (v1export && !obj->parent) {
+    /* only latency matrices covering the entire machine can be exported to v1 */
+    struct hwloc_internal_distances_s *dist;
+    /* refresh distances since we need objects below */
+    hwloc_internal_distances_refresh(topology);
+    for(dist = topology->first_dist; dist; dist = dist->next) {
+      struct hwloc__xml_export_state_s childstate;
+      unsigned nbobjs = dist->nbobjs;
+      unsigned *logical_to_v2array;
+      int depth;
+
+      if (nbobjs != (unsigned) hwloc_get_nbobjs_by_type(topology, dist->unique_type))
+	continue;
+      if (!(dist->kind & HWLOC_DISTANCES_KIND_MEANS_LATENCY))
+	continue;
+      if (dist->kind & HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES)
+	continue;
+
+      logical_to_v2array = malloc(nbobjs * sizeof(*logical_to_v2array));
+      if (!logical_to_v2array) {
+	fprintf(stderr, "xml/export/v1: failed to allocated logical_to_v2array\n");
+	continue;
+      }
+
+      /* map the logical index of each object to its position in the distances arrays */
+      for(i=0; i<nbobjs; i++)
+	logical_to_v2array[dist->objs[i]->logical_index] = i;
+
+      /* compute the relative depth */
+      if (dist->unique_type == HWLOC_OBJ_NUMANODE) {
+	/* for NUMA nodes, use the highest normal-parent depth + 1 */
+	depth = -1;
+	for(i=0; i<nbobjs; i++) {
+	  hwloc_obj_t parent = dist->objs[i]->parent;
+	  while (hwloc__obj_type_is_memory(parent->type))
+	    parent = parent->parent;
+	  if (parent->depth+1 > depth)
+	    depth = parent->depth+1;
+	}
+      } else {
+	/* for non-NUMA nodes, increase the object depth if any of them has memory above */
+	int parent_with_memory = 0;
+	for(i=0; i<nbobjs; i++) {
+	  hwloc_obj_t parent = dist->objs[i]->parent;
+	  while (parent) {
+	    if (parent->memory_first_child) {
+	      parent_with_memory = 1;
+	      goto done;
+	    }
+	    parent = parent->parent;
+	  }
+	}
+      done:
+	depth = hwloc_get_type_depth(topology, dist->unique_type) + parent_with_memory;
+      }
+
+      state->new_child(state, &childstate, "distances");
+      sprintf(tmp, "%u", nbobjs);
+      childstate.new_prop(&childstate, "nbobjs", tmp);
+      sprintf(tmp, "%d", depth);
+      childstate.new_prop(&childstate, "relative_depth", tmp);
+      sprintf(tmp, "%f", 1.f);
+      childstate.new_prop(&childstate, "latency_base", tmp);
+      for(i=0; i<nbobjs; i++) {
+        for(j=0; j<nbobjs; j++) {
+	  /* we should export i*nbobjs+j, we translate using logical_to_v2array[] */
+	  unsigned k = logical_to_v2array[i]*nbobjs+logical_to_v2array[j];
+	  struct hwloc__xml_export_state_s greatchildstate;
+	  childstate.new_child(&childstate, &greatchildstate, "latency");
+	  sprintf(tmp, "%f", (float) dist->values[k]);
+	  greatchildstate.new_prop(&greatchildstate, "value", tmp);
+	  greatchildstate.end_object(&greatchildstate, "latency");
+	}
+      }
+      childstate.end_object(&childstate, "distances");
+      free(logical_to_v2array);
+    }
+  }
+
+  if (obj->userdata && topology->userdata_export_cb)
+    topology->userdata_export_cb((void*) state, topology, obj);
+}
+
+/* Recursively export obj and everything below it as nested v2 "object" elements.
+ * Children are exported in this order: memory, normal, I/O, then Misc.
+ */
+static void
+hwloc__xml_v2export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s ostate;
+  hwloc_obj_t sub;
+
+  /* open the element and fill the attributes of obj itself */
+  parentstate->new_child(parentstate, &ostate, "object");
+  hwloc__xml_export_object_contents(&ostate, topology, obj, flags);
+
+  /* then recurse into each list of children */
+  for_each_memory_child(sub, obj) {
+    hwloc__xml_v2export_object (&ostate, topology, sub, flags);
+  }
+  for_each_child(sub, obj) {
+    hwloc__xml_v2export_object (&ostate, topology, sub, flags);
+  }
+  for_each_io_child(sub, obj) {
+    hwloc__xml_v2export_object (&ostate, topology, sub, flags);
+  }
+  for_each_misc_child(sub, obj) {
+    hwloc__xml_v2export_object (&ostate, topology, sub, flags);
+  }
+
+  ostate.end_object(&ostate, "object");
+}
+
+/* forward declaration since hwloc__xml_v1export_object() and
+ * hwloc__xml_v1export_object_with_memory() below call each other
+ */
+static void
+hwloc__xml_v1export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags);
+
+/* Iterate over the NUMA nodes found in the memory children of obj.
+ *
+ * cur is the NUMA node returned by the previous call, or NULL to get the first one.
+ * Returns the next NUMA node, or NULL once the walk came back up to obj.
+ */
+static hwloc_obj_t
+hwloc__xml_v1export_object_next_numanode(hwloc_obj_t obj, hwloc_obj_t cur)
+{
+  hwloc_obj_t parent;
+
+  if (!cur) {
+    /* first numa node is on the very bottom left */
+    cur = obj->memory_first_child;
+    goto find_first;
+  }
+
+  /* walk-up until there's a next sibling */
+  parent = cur;
+  while (1) {
+    if (parent->next_sibling) {
+      /* found a next sibling, we'll walk down-left from there */
+      cur = parent->next_sibling;
+      break;
+    }
+    parent = parent->parent;
+    if (parent == obj)
+      return NULL;
+  }
+
+ find_first:
+  /* walk down-left until we reach a NUMA node.
+   * Stop on NULL so that a memory branch without NUMA node is caught by the
+   * assertion below instead of being dereferenced by the loop first.
+   */
+  while (cur && cur->type != HWLOC_OBJ_NUMANODE)
+    cur = cur->memory_first_child;
+  assert(cur);
+  return cur;
+}
+
+/* List the NUMA nodes found in the memory children of obj.
+ *
+ * Returns the number of nodes, with the first one in *first_p
+ * and a calloc'ed array of all of them in *nodes_p.
+ * If obj has no memory child, 0 is returned and both outputs are NULL.
+ * If the array cannot be allocated, only the first node is reported:
+ * 1 is returned and *nodes_p is NULL.
+ */
+static unsigned
+hwloc__xml_v1export_object_list_numanodes(hwloc_obj_t obj, hwloc_obj_t *first_p, hwloc_obj_t **nodes_p)
+{
+  hwloc_obj_t *array, node;
+  int count;
+
+  if (!obj->memory_first_child) {
+    *first_p = NULL;
+    *nodes_p = NULL;
+    return 0;
+  }
+  /* from now on, there's at least one numa node */
+
+  /* the nodeset contains the local nodes, but some of them may be attached above instead of here */
+  count = hwloc_bitmap_weight(obj->nodeset);
+  assert(count > 0);
+
+  array = calloc(count, sizeof(*array));
+  if (!array) {
+    /* no array, only report the first node */
+    node = hwloc__xml_v1export_object_next_numanode(obj, NULL);
+    assert(node);
+    *first_p = node;
+    *nodes_p = NULL;
+    return 1;
+  }
+
+  /* restart counting while filling the array with what's actually below obj */
+  count = 0;
+  for(node = hwloc__xml_v1export_object_next_numanode(obj, NULL);
+      node != NULL;
+      node = hwloc__xml_v1export_object_next_numanode(obj, node))
+    array[count++] = node;
+
+  *first_p = array[0];
+  *nodes_p = array;
+  return count;
+}
+
+/* v1 export of an object that has memory children.
+ *
+ * The first NUMA node found below obj is exported as an element that contains obj,
+ * obj itself contains its normal/io/misc children,
+ * and the other NUMA nodes are exported next to the first one.
+ * All of this is wrapped in the temporary v1_memory_group object when the parent of obj
+ * has several children, several NUMA nodes were found, and that group was allocated.
+ */
+static void
+hwloc__xml_v1export_object_with_memory(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s gstate, mstate, ostate, *state = parentstate;
+  hwloc_obj_t child;
+  unsigned nr_numanodes;
+  hwloc_obj_t *numanodes, first_numanode;
+  unsigned i;
+
+  nr_numanodes = hwloc__xml_v1export_object_list_numanodes(obj, &first_numanode, &numanodes);
+
+  if (obj->parent->arity > 1 && nr_numanodes > 1 && parentstate->global->v1_memory_group) {
+    /* child has sibling, we must add a Group around those memory children */
+    hwloc_obj_t group = parentstate->global->v1_memory_group;
+    parentstate->new_child(parentstate, &gstate, "object");
+    /* temporarily lend the sets of obj to the group while it is exported */
+    group->cpuset = obj->cpuset;
+    group->complete_cpuset = obj->complete_cpuset;
+    group->nodeset = obj->nodeset;
+    group->complete_nodeset = obj->complete_nodeset;
+    hwloc__xml_export_object_contents (&gstate, topology, group, flags);
+    group->cpuset = NULL;
+    group->complete_cpuset = NULL;
+    group->nodeset = NULL;
+    group->complete_nodeset = NULL;
+    /* everything below now goes inside the group */
+    state = &gstate;
+  }
+
+  /* export first memory child */
+  state->new_child(state, &mstate, "object");
+  hwloc__xml_export_object_contents (&mstate, topology, first_numanode, flags);
+
+  /* then the actual object */
+  mstate.new_child(&mstate, &ostate, "object");
+  hwloc__xml_export_object_contents (&ostate, topology, obj, flags);
+
+  /* then its normal/io/misc children */
+  for_each_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+  for_each_io_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+  for_each_misc_child(child, obj)
+    hwloc__xml_v1export_object (&ostate, topology, child, flags);
+
+  /* close object and first memory child */
+  ostate.end_object(&ostate, "object");
+  mstate.end_object(&mstate, "object");
+
+  /* now other memory children */
+  for(i=1; i<nr_numanodes; i++)
+    hwloc__xml_v1export_object (state, topology, numanodes[i], flags);
+
+  free(numanodes);
+
+  if (state == &gstate) {
+    /* close group if any */
+    gstate.end_object(&gstate, "object");
+  }
+}
+
+/* Recursively export obj and its normal, I/O and Misc children as nested v1 "object" elements.
+ * Normal children that have memory children are handed to
+ * hwloc__xml_v1export_object_with_memory() instead of being exported directly.
+ */
+static void
+hwloc__xml_v1export_object (hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, hwloc_obj_t obj, unsigned long flags)
+{
+  struct hwloc__xml_export_state_s ostate;
+  hwloc_obj_t sub;
+
+  /* open the element and fill the attributes of obj itself */
+  parentstate->new_child(parentstate, &ostate, "object");
+  hwloc__xml_export_object_contents(&ostate, topology, obj, flags);
+
+  for_each_child(sub, obj) {
+    if (sub->memory_arity)
+      /* memory children need the special v1 layout */
+      hwloc__xml_v1export_object_with_memory(&ostate, topology, sub, flags);
+    else
+      /* no memory child, just export normally */
+      hwloc__xml_v1export_object (&ostate, topology, sub, flags);
+  }
+
+  for_each_io_child(sub, obj) {
+    hwloc__xml_v1export_object (&ostate, topology, sub, flags);
+  }
+  for_each_misc_child(sub, obj) {
+    hwloc__xml_v1export_object (&ostate, topology, sub, flags);
+  }
+
+  ostate.end_object(&ostate, "object");
+}
+
+/* Export the nr elements of values as a sequence of tagname children of state.
+ * Each child contains at most maxperline values, each of them being cast to type
+ * and printed with format followed by a space.
+ * The "length" property of each child is the number of characters in its content.
+ */
+#define EXPORT_ARRAY(state, type, nr, values, tagname, format, maxperline) do { \
+  unsigned _i = 0; \
+  while (_i<(nr)) { \
+    char _tmp[255]; /* enough for (snprintf(format)+space) x maxperline */ \
+    char _tmp2[16]; \
+    size_t _len = 0; \
+    unsigned _j; \
+    struct hwloc__xml_export_state_s _childstate; \
+    (state)->new_child(state, &_childstate, tagname); \
+    for(_j=0; \
+	_i+_j<(nr) && _j<maxperline; \
+	_j++) \
+      _len += sprintf(_tmp+_len, format " ", (type) (values)[_i+_j]); \
+    _i += _j; \
+    sprintf(_tmp2, "%lu", (unsigned long) _len); \
+    _childstate.new_prop(&_childstate, "length", _tmp2); \
+    _childstate.add_content(&_childstate, _tmp, _len); \
+    _childstate.end_object(&_childstate, tagname); \
+  } \
+} while (0)
+
+/* Export the nr objects of objs as a sequence of tagname children of state.
+ * Each child contains at most maxperline objects, each of them being printed
+ * as its type string, a colon, its gp_index, and a space.
+ * The "length" property of each child is the number of characters in its content.
+ */
+#define EXPORT_TYPE_GPINDEX_ARRAY(state, nr, objs, tagname, maxperline) do { \
+  unsigned _i = 0; \
+  while (_i<(nr)) { \
+    char _tmp[255]; /* enough for (snprintf(type+index)+space) x maxperline */ \
+    char _tmp2[16]; \
+    size_t _len = 0; \
+    unsigned _j; \
+    struct hwloc__xml_export_state_s _childstate; \
+    (state)->new_child(state, &_childstate, tagname); \
+    for(_j=0; \
+	_i+_j<(nr) && _j<maxperline; \
+	_j++) \
+      _len += sprintf(_tmp+_len, "%s:%llu ", hwloc_obj_type_string((objs)[_i+_j]->type), (unsigned long long) (objs)[_i+_j]->gp_index); \
+    _i += _j; \
+    sprintf(_tmp2, "%lu", (unsigned long) _len); \
+    _childstate.new_prop(&_childstate, "length", _tmp2); \
+    _childstate.add_content(&_childstate, _tmp, _len); \
+    _childstate.end_object(&_childstate, tagname); \
+  } \
+} while (0)
+
+/* Export a single distances structure as a child of parentstate.
+ * The element is "distances2hetero" if dist->different_types is set, "distances2" otherwise.
+ * The "type" and "indexing" properties are only exported in the latter case.
+ */
+static void
+hwloc___xml_v2export_distances(hwloc__xml_export_state_t parentstate, struct hwloc_internal_distances_s *dist)
+{
+  const char *tag = dist->different_types ? "distances2hetero" : "distances2";
+  struct hwloc__xml_export_state_s state;
+  unsigned nbobjs = dist->nbobjs;
+  char buf[255];
+
+  parentstate->new_child(parentstate, &state, tag);
+  if (!dist->different_types)
+    state.new_prop(&state, "type", hwloc_obj_type_string(dist->unique_type));
+
+  sprintf(buf, "%u", nbobjs);
+  state.new_prop(&state, "nbobjs", buf);
+  sprintf(buf, "%lu", dist->kind);
+  state.new_prop(&state, "kind", buf);
+  if (dist->name)
+    state.new_prop(&state, "name", dist->name);
+
+  /* TODO don't hardwire 10 below. either snprintf the max to guess it, or just append until the end of the buffer */
+  if (dist->different_types) {
+    /* objects are identified by their type and gp_index */
+    EXPORT_TYPE_GPINDEX_ARRAY(&state, nbobjs, dist->objs, "indexes", 10);
+  } else {
+    /* a single type, tell which kind of index is used before listing them */
+    state.new_prop(&state, "indexing",
+		   HWLOC_DIST_TYPE_USE_OS_INDEX(dist->unique_type) ? "os" : "gp");
+    EXPORT_ARRAY(&state, unsigned long long, nbobjs, dist->indexes, "indexes", "%llu", 10);
+  }
+  EXPORT_ARRAY(&state, unsigned long long, nbobjs*nbobjs, dist->values, "u64values", "%llu", 10);
+
+  state.end_object(&state, tag);
+}
+
+/* Export all distances structures of the topology as children of parentstate. */
+static void
+hwloc__xml_v2export_distances(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology)
+{
+  struct hwloc_internal_distances_s *dist;
+  int hetero;
+
+  /* two passes over the list:
+   * export homogeneous distances first in case the importer doesn't support heterogeneous and stops there
+   */
+  for(hetero = 0; hetero < 2; hetero++) {
+    for(dist = topology->first_dist; dist; dist = dist->next) {
+      if (!dist->different_types == !hetero)
+	hwloc___xml_v2export_distances(parentstate, dist);
+    }
+  }
+}
+
+/* Export the entire topology as children of the XML element `state'.
+ *
+ * Without HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 in flags, the tree of objects is exported
+ * from the root, followed by the distances.
+ * With that flag, the v1 layout is used: if NUMA nodes are found in the memory children
+ * of the root, the first one is exported inside the root and contains the normal/io/misc
+ * children of the root, and the other ones are exported after it inside the root.
+ */
+void
+hwloc__xml_export_topology(hwloc__xml_export_state_t state, hwloc_topology_t topology, unsigned long flags)
+{
+  hwloc_obj_t root = hwloc_get_root_obj(topology);
+
+  if (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) {
+    hwloc_obj_t *numanodes, first_numanode;
+    unsigned nr_numanodes;
+
+    nr_numanodes = hwloc__xml_v1export_object_list_numanodes(root, &first_numanode, &numanodes);
+
+    if (nr_numanodes) {
+      /* we don't use hwloc__xml_v1export_object_with_memory() because we want/can keep root above the numa node */
+      struct hwloc__xml_export_state_s rstate, mstate;
+      hwloc_obj_t child;
+      unsigned i;
+      /* export the root */
+      state->new_child(state, &rstate, "object");
+      hwloc__xml_export_object_contents (&rstate, topology, root, flags);
+      /* export first memory child */
+      rstate.new_child(&rstate, &mstate, "object");
+      hwloc__xml_export_object_contents (&mstate, topology, first_numanode, flags);
+      /* then its normal/io/misc children */
+      for_each_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      for_each_io_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      for_each_misc_child(child, root)
+	hwloc__xml_v1export_object (&mstate, topology, child, flags);
+      /* close first memory child */
+      mstate.end_object(&mstate, "object");
+      /* now other memory children */
+      for(i=1; i<nr_numanodes; i++)
+	hwloc__xml_v1export_object (&rstate, topology, numanodes[i], flags);
+      /* close the root */
+      rstate.end_object(&rstate, "object");
+    } else {
+      hwloc__xml_v1export_object(state, topology, root, flags);
+    }
+
+    /* numanodes is NULL if no node was found or if the array could not be allocated */
+    free(numanodes);
+
+  } else {
+    hwloc__xml_v2export_object (state, topology, root, flags);
+    hwloc__xml_v2export_distances (state, topology);
+  }
+}
+
+/* Export each element of the diff list as a "diff" child of parentstate.
+ * Only HWLOC_TOPOLOGY_DIFF_OBJ_ATTR diffs are supported, anything else fails an assertion.
+ */
+void
+hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff)
+{
+  for( ; diff != NULL; diff = diff->generic.next) {
+    struct hwloc__xml_export_state_s dstate;
+    char buf[255];
+
+    parentstate->new_child(parentstate, &dstate, "diff");
+
+    sprintf(buf, "%d", (int) diff->generic.type);
+    dstate.new_prop(&dstate, "type", buf);
+
+    if (diff->generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR) {
+      /* which object is modified */
+      sprintf(buf, "%d", diff->obj_attr.obj_depth);
+      dstate.new_prop(&dstate, "obj_depth", buf);
+      sprintf(buf, "%u", diff->obj_attr.obj_index);
+      dstate.new_prop(&dstate, "obj_index", buf);
+
+      /* which kind of attribute is modified */
+      sprintf(buf, "%d", (int) diff->obj_attr.diff.generic.type);
+      dstate.new_prop(&dstate, "obj_attr_type", buf);
+
+      /* and how */
+      switch (diff->obj_attr.diff.generic.type) {
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE:
+	sprintf(buf, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.index);
+	dstate.new_prop(&dstate, "obj_attr_index", buf);
+	sprintf(buf, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.oldvalue);
+	dstate.new_prop(&dstate, "obj_attr_oldvalue", buf);
+	sprintf(buf, "%llu", (unsigned long long) diff->obj_attr.diff.uint64.newvalue);
+	dstate.new_prop(&dstate, "obj_attr_newvalue", buf);
+	break;
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME:
+      case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO:
+	if (diff->obj_attr.diff.string.name)
+	  dstate.new_prop(&dstate, "obj_attr_name", diff->obj_attr.diff.string.name);
+	dstate.new_prop(&dstate, "obj_attr_oldvalue", diff->obj_attr.diff.string.oldvalue);
+	dstate.new_prop(&dstate, "obj_attr_newvalue", diff->obj_attr.diff.string.newvalue);
+	break;
+      }
+    } else {
+      /* unknown diff type */
+      assert(0);
+    }
+
+    dstate.end_object(&dstate, "diff");
+  }
+}
+
+/**********************************
+ ********* main XML export ********
+ **********************************/
+
+/* Export the topology to the XML file filename.
+ *
+ * This can be the first XML call.
+ * Returns -1 with errno set to EINVAL if the topology is not loaded
+ * or if flags contains anything but HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1.
+ * Otherwise the value returned by the export_file callback is returned.
+ */
+int hwloc_topology_export_xml(hwloc_topology_t topology, const char *filename, unsigned long flags)
+{
+  hwloc_localeswitch_declare;
+  struct hwloc__xml_export_data_s edata;
+  int force_nolibxml;
+  int ret;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  if (flags & ~HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  hwloc_internal_distances_refresh(topology);
+
+  hwloc_localeswitch_init();
+
+  /* v1 export of memory children needs a temporary group, nothing is needed otherwise */
+  edata.v1_memory_group = (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)
+    ? hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX)
+    : NULL;
+
+  force_nolibxml = hwloc_nolibxml_export();
+  for(;;) {
+    if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml)) {
+      ret = hwloc_nolibxml_callbacks->export_file(topology, &edata, filename, flags);
+      break;
+    }
+
+    ret = hwloc_libxml_callbacks->export_file(topology, &edata, filename, flags);
+    if (ret >= 0 || errno != ENOSYS)
+      break;
+
+    /* libxml failed with ENOSYS, forget it so that the next iteration uses the nolibxml callbacks */
+    hwloc_libxml_callbacks = NULL;
+  }
+
+  if (edata.v1_memory_group)
+    hwloc_free_unlinked_object(edata.v1_memory_group);
+
+  hwloc_localeswitch_fini();
+  return ret;
+}
+
+/* Export a loaded topology through the export_buffer() callback, which receives xmlbuffer and buflen as outputs. Returns -1 with errno=EINVAL if the topology is not loaded or if flags has bits other than HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1; otherwise returns the callback result. This can be the first XML call. */
+int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags)
+{
+  hwloc_localeswitch_declare;
+  struct hwloc__xml_export_data_s edata;
+  int force_nolibxml;
+  int ret;
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  if (flags & ~HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1) { /* V1 is the only flag accepted here */
+    errno = EINVAL;
+    return -1;
+  }
+
+  hwloc_internal_distances_refresh(topology);
+
+  hwloc_localeswitch_init();
+
+  edata.v1_memory_group = NULL;
+  if (flags & HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1)
+    /* temporary group to be used during v1 export of memory children */
+    edata.v1_memory_group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_buffer(topology, &edata, xmlbuffer, buflen, flags);
+  else {
+    ret = hwloc_libxml_callbacks->export_buffer(topology, &edata, xmlbuffer, buflen, flags);
+    if (ret < 0 && errno == ENOSYS) { /* libxml callback reported ENOSYS: drop libxml and retry, which then selects nolibxml */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  if (edata.v1_memory_group) /* release the temporary v1 group if one was allocated above */
+    hwloc_free_unlinked_object(edata.v1_memory_group);
+
+  hwloc_localeswitch_fini();
+  return ret;
+}
+
+/* Export a topology diff list to the XML file `filename`. Returns -1 with errno=EINVAL if any entry is HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX; otherwise returns the export_diff_file() callback result. Calls hwloc_components_init()/fini() itself, so this can be the first XML call. */
+int
+hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname,
+			       const char *filename)
+{
+  hwloc_localeswitch_declare;
+  hwloc_topology_diff_t tmpdiff;
+  int force_nolibxml;
+  int ret;
+
+  tmpdiff = diff;
+  while (tmpdiff) { /* validate the whole list before initializing anything */
+    if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+      errno = EINVAL;
+      return -1;
+    }
+    tmpdiff = tmpdiff->generic.next;
+  }
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_diff_file(diff, refname, filename);
+  else {
+    ret = hwloc_libxml_callbacks->export_diff_file(diff, refname, filename);
+    if (ret < 0 && errno == ENOSYS) { /* libxml callback reported ENOSYS: drop libxml and retry, which then selects nolibxml */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini(); /* pairs with the hwloc_components_init() call above */
+  return ret;
+}
+
+/* Export a topology diff list through the export_diff_buffer() callback, which receives xmlbuffer and buflen as outputs. Returns -1 with errno=EINVAL if any entry is HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX; otherwise returns the callback result. Calls hwloc_components_init()/fini() itself, so this can be the first XML call. */
+int
+hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname,
+				     char **xmlbuffer, int *buflen)
+{
+  hwloc_localeswitch_declare;
+  hwloc_topology_diff_t tmpdiff;
+  int force_nolibxml;
+  int ret;
+
+  tmpdiff = diff;
+  while (tmpdiff) { /* validate the whole list before initializing anything */
+    if (tmpdiff->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX) {
+      errno = EINVAL;
+      return -1;
+    }
+    tmpdiff = tmpdiff->generic.next;
+  }
+
+  hwloc_components_init();
+  assert(hwloc_nolibxml_callbacks);
+
+  hwloc_localeswitch_init();
+
+  force_nolibxml = hwloc_nolibxml_export();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    ret = hwloc_nolibxml_callbacks->export_diff_buffer(diff, refname, xmlbuffer, buflen);
+  else {
+    ret = hwloc_libxml_callbacks->export_diff_buffer(diff, refname, xmlbuffer, buflen);
+    if (ret < 0 && errno == ENOSYS) { /* libxml callback reported ENOSYS: drop libxml and retry, which then selects nolibxml */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+
+  hwloc_localeswitch_fini();
+  hwloc_components_fini(); /* pairs with the hwloc_components_init() call above */
+  return ret;
+}
+
+void hwloc_free_xmlbuffer(hwloc_topology_t topology __hwloc_attribute_unused, char *xmlbuffer)
+{ /* Release xmlbuffer through the free_buffer() callback of either the nolibxml or the libxml backend. */
+  int force_nolibxml;
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the topology */
+
+  force_nolibxml = hwloc_nolibxml_export();
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml)) /* same backend test as the export functions in this file */
+    hwloc_nolibxml_callbacks->free_buffer(xmlbuffer);
+  else
+    hwloc_libxml_callbacks->free_buffer(xmlbuffer);
+}
+
+void
+hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+					    void (*export)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj))
+{ /* Store the caller-provided export callback in topology->userdata_export_cb. */
+  topology->userdata_export_cb = export;
+}
+
+static void
+hwloc__export_obj_userdata(hwloc__xml_export_state_t parentstate, int encoded,
+			   const char *name, size_t length, const void *buffer, size_t encoded_length)
+{ /* Emit one "userdata" child under parentstate: optional "name", "length", optional encoding="base64", then the buffer as content. */
+  struct hwloc__xml_export_state_s state;
+  char tmp[255];
+  parentstate->new_child(parentstate, &state, "userdata");
+  if (name)
+    state.new_prop(&state, "name", name);
+  sprintf(tmp, "%lu", (unsigned long) length);
+  state.new_prop(&state, "length", tmp); /* always the `length` argument, even when the content is encoded */
+  if (encoded)
+    state.new_prop(&state, "encoding", "base64");
+  if (encoded_length) /* no content is written when encoded_length is 0 */
+    state.add_content(&state, buffer, encoded ? encoded_length : length);
+  state.end_object(&state, "userdata");
+}
+
+int
+hwloc_export_obj_userdata(void *reserved,
+			  struct hwloc_topology *topology, struct hwloc_obj *obj __hwloc_attribute_unused,
+			  const char *name, const void *buffer, size_t length)
+{ /* Export buffer as a "userdata" element on the export state passed in `reserved`. Returns 0 on success, -1 with errno=EINVAL on invalid arguments. */
+  hwloc__xml_export_state_t state = reserved;
+
+  if (!buffer || (!name && topology->userdata_not_decoded)) { /* a NULL name cannot carry the "base64"/"normal" prefix parsed below */
+    errno = EINVAL;
+    return -1;
+  }
+
+  if ((name && hwloc__xml_export_check_buffer(name, strlen(name)) < 0)
+      || hwloc__xml_export_check_buffer(buffer, length) < 0) { /* reject a name or buffer that fails the export check */
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (topology->userdata_not_decoded) { /* name is expected to look like "base64:<name>", "normal:<name>", "base64-anon" or "normal-anon" */
+    int encoded;
+    size_t encoded_length;
+    const char *realname;
+    if (!strncmp(name, "base64", 6)) {
+      encoded = 1;
+      encoded_length = BASE64_ENCODED_LENGTH(length);
+    } else {
+      assert(!strncmp(name, "normal", 6));
+      encoded = 0;
+      encoded_length = length;
+    }
+    if (name[6] == ':')
+      realname = name+7; /* skip the 6-character prefix and the ':' */
+    else {
+      assert(!strcmp(name+6, "-anon"));
+      realname = NULL; /* anonymous userdata: no "name" attribute is exported */
+    }
+    hwloc__export_obj_userdata(state, encoded, realname, length, buffer, encoded_length);
+
+  } else
+    hwloc__export_obj_userdata(state, 0, name, length, buffer, length);
+
+  return 0;
+}
+
+int
+hwloc_export_obj_userdata_base64(void *reserved,
+				 struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *obj __hwloc_attribute_unused,
+				 const char *name, const void *buffer, size_t length)
+{ /* Base64-encode buffer and export it as a "userdata" element. Returns 0 on success, -1 with errno=EINVAL (bad arguments) or ENOMEM (allocation failure). */
+  hwloc__xml_export_state_t state = reserved;
+  size_t encoded_length;
+  char *encoded_buffer;
+  int ret __hwloc_attribute_unused; /* only read by the assert after encoding */
+
+  if (!buffer) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  assert(!topology->userdata_not_decoded);
+
+  if (name && hwloc__xml_export_check_buffer(name, strlen(name)) < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  encoded_length = BASE64_ENCODED_LENGTH(length);
+  encoded_buffer = malloc(encoded_length+1); /* one byte more than the encoded length; presumably for a terminating NUL — confirm in hwloc_encode_to_base64() */
+  if (!encoded_buffer) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  ret = hwloc_encode_to_base64(buffer, length, encoded_buffer, encoded_length+1);
+  assert(ret == (int) encoded_length);
+
+  hwloc__export_obj_userdata(state, 1, name, length, encoded_buffer, encoded_length);
+
+  free(encoded_buffer); /* the temporary encoded copy is released right after the export call */
+  return 0;
+}
+
+void
+hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+					    void (*import)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length))
+{ /* Store the caller-provided import callback in topology->userdata_import_cb. */
+  topology->userdata_import_cb = import;
+}
+
+/***************************************
+ ************ XML component ************
+ ***************************************/
+
+static void
+hwloc_xml_backend_disable(struct hwloc_backend *backend)
+{ /* Installed as backend->disable by hwloc_xml_component_instantiate(): run backend_exit(), then free msgprefix and the private data. */
+  struct hwloc_xml_backend_data_s *data = backend->private_data;
+  data->backend_exit(data);
+  free(data->msgprefix);
+  free(data);
+}
+
+static struct hwloc_backend *
+hwloc_xml_component_instantiate(struct hwloc_topology *topology,
+				struct hwloc_disc_component *component,
+				unsigned excluded_phases __hwloc_attribute_unused,
+				const void *_data1,
+				const void *_data2,
+				const void *_data3)
+{ /* Create the XML backend from a file path (_data1) or a memory buffer (_data2, length in _data3). Returns the backend, or NULL on failure. */
+  struct hwloc_xml_backend_data_s *data;
+  struct hwloc_backend *backend;
+  const char *env;
+  int force_nolibxml;
+  const char * xmlpath = (const char *) _data1;
+  const char * xmlbuffer = (const char *) _data2;
+  int xmlbuflen = (int)(uintptr_t) _data3; /* the length is carried in the pointer value itself */
+  const char *local_basename;
+  int err;
+
+  assert(hwloc_nolibxml_callbacks); /* the core called components_init() for the component's topology */
+
+  if (!xmlpath && !xmlbuffer) { /* no explicit input: fall back to the HWLOC_XMLFILE environment variable */
+    env = getenv("HWLOC_XMLFILE");
+    if (env) {
+      /* 'xml' was given in HWLOC_COMPONENTS without a filename */
+      xmlpath = env;
+    } else {
+      errno = EINVAL;
+      goto out;
+    }
+  }
+
+  backend = hwloc_backend_alloc(topology, component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  backend->private_data = data;
+  backend->discover = hwloc_look_xml;
+  backend->disable = hwloc_xml_backend_disable;
+  backend->is_thissystem = 0;
+
+  if (xmlpath) { /* message prefix is the part of xmlpath after the last '/', or the whole path if there is none */
+    local_basename = strrchr(xmlpath, '/');
+    if (local_basename)
+      local_basename++;
+    else
+      local_basename = xmlpath;
+  } else {
+    local_basename = "xmlbuffer";
+  }
+  data->msgprefix = strdup(local_basename); /* NOTE(review): the strdup() result is not checked here */
+
+  force_nolibxml = hwloc_nolibxml_import();
+retry:
+  if (!hwloc_libxml_callbacks || (hwloc_nolibxml_callbacks && force_nolibxml))
+    err = hwloc_nolibxml_callbacks->backend_init(data, xmlpath, xmlbuffer, xmlbuflen);
+  else {
+    err = hwloc_libxml_callbacks->backend_init(data, xmlpath, xmlbuffer, xmlbuflen);
+    if (err < 0 && errno == ENOSYS) { /* libxml callback reported ENOSYS: drop libxml and retry, which then selects nolibxml */
+      hwloc_libxml_callbacks = NULL;
+      goto retry;
+    }
+  }
+  if (err < 0)
+    goto out_with_data;
+
+  return backend;
+
+ out_with_data:
+  free(data->msgprefix);
+  free(data);
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL; /* errno was set explicitly on the EINVAL/ENOMEM paths above */
+}
+
+static struct hwloc_disc_component hwloc_xml_disc_component = { /* discovery component descriptor for the XML backend; positional initializer */
+  "xml", /* component name (the instantiate function refers to 'xml' being given in HWLOC_COMPONENTS) */
+  HWLOC_DISC_PHASE_GLOBAL,
+  ~0, /* NOTE(review): presumably the mask of excluded phases (all bits set) — confirm against struct hwloc_disc_component */
+  hwloc_xml_component_instantiate,
+  30, /* NOTE(review): presumably the priority — confirm against struct hwloc_disc_component */
+  1, /* NOTE(review): presumably "enabled by default" — confirm against struct hwloc_disc_component */
+  NULL
+};
+
+const struct hwloc_component hwloc_xml_component = { /* generic component wrapper that points at hwloc_xml_disc_component */
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* NOTE(review): presumably optional init/finalize hooks, unused here — confirm against struct hwloc_component */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_xml_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology.c b/ext/hwloc/hwloc/topology.c
index a67d036c9..8d376193b 100644
--- a/ext/hwloc/hwloc/topology.c
+++ b/ext/hwloc/hwloc/topology.c
@@ -1,12 +1,12 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
+#include "private/autogen/config.h"
 
 #define _ATFILE_SOURCE
 #include <assert.h>
@@ -25,10 +25,10 @@
 #include <limits.h>
 #include <float.h>
 
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/debug.h>
-#include <private/misc.h>
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/debug.h"
+#include "private/misc.h"
 
 #ifdef HAVE_MACH_MACH_INIT_H
 #include <mach/mach_init.h>
@@ -54,6 +54,11 @@ unsigned hwloc_get_api_version(void)
   return HWLOC_API_VERSION;
 }
 
+int hwloc_topology_abi_check(hwloc_topology_t topology)
+{ /* Return 0 when topology->topology_abi equals HWLOC_TOPOLOGY_ABI of this build, -1 otherwise. */
+  return topology->topology_abi != HWLOC_TOPOLOGY_ABI ? -1 : 0;
+}
+
 int hwloc_hide_errors(void)
 {
   static int hide = 0;
@@ -69,26 +74,28 @@ int hwloc_hide_errors(void)
 
 void hwloc_report_os_error(const char *msg, int line)
 {
-    static int reported = 0;
-
-    if (!reported && !hwloc_hide_errors()) {
-        fprintf(stderr, "****************************************************************************\n");
-        fprintf(stderr, "* hwloc %s has encountered what looks like an error from the operating system.\n", HWLOC_VERSION);
-        fprintf(stderr, "*\n");
-        fprintf(stderr, "* %s\n", msg);
-        fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
-        fprintf(stderr, "*\n");
-        fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
-        fprintf(stderr, "*   What should I do when hwloc reports \"operating system\" warnings?\n");
-        fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
+  static int reported = 0; /* set to 1 after the banner is printed, so it is emitted at most once */
+
+  if (!reported && !hwloc_hide_errors()) { /* also silenced whenever hwloc_hide_errors() returns nonzero */
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s received invalid information from the operating system.\n", HWLOC_VERSION);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* %s\n", msg);
+    fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
+    fprintf(stderr, "*   What should I do when hwloc reports \"operating system\" warnings?\n");
+    fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
 #ifdef HWLOC_LINUX_SYS
-        fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+    fprintf(stderr, "* along with the files generated by the hwloc-gather-topology script.\n");
 #else
-	fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+    fprintf(stderr, "* along with any relevant topology information from your platform.\n");
 #endif
-        fprintf(stderr, "****************************************************************************\n");
-        reported = 1;
-    }
+    fprintf(stderr, "* \n");
+    fprintf(stderr, "* hwloc will now ignore this invalid topology information and continue.\n");
+    fprintf(stderr, "****************************************************************************\n");
+    reported = 1;
+  }
 }
 
 #if defined(HAVE_SYSCTLBYNAME)
@@ -129,13 +136,28 @@ int hwloc_get_sysctl(int name[], unsigned namelen, int *ret)
 }
 #endif
 
-/* Return the OS-provided number of processors.  Unlike other methods such as
-   reading sysfs on Linux, this method is not virtualizable; thus it's only
-   used as a fall-back method, allowing virtual backends (FSROOT, etc) to
-   have the desired effect.  */
-unsigned
-hwloc_fallback_nbprocessors(struct hwloc_topology *topology) {
+/* Return the OS-provided number of processors.
+ * Assumes topology->is_thissystem is true.
+ */
+#ifndef HWLOC_WIN_SYS /* The windows implementation is in topology-windows.c */
+int
+hwloc_fallback_nbprocessors(unsigned flags) {
   int n;
+
+  if (flags & HWLOC_FALLBACK_NBPROCESSORS_INCLUDE_OFFLINE) {
+    /* try to get all CPUs for Linux and Solaris that can handle offline CPUs */
+#if HAVE_DECL__SC_NPROCESSORS_CONF
+    n = sysconf(_SC_NPROCESSORS_CONF);
+#elif HAVE_DECL__SC_NPROC_CONF
+    n = sysconf(_SC_NPROC_CONF);
+#else
+    n = -1;
+#endif
+    if (n != -1)
+      return n;
+  }
+
+  /* try getting only online CPUs, or whatever we can get */
 #if HAVE_DECL__SC_NPROCESSORS_ONLN
   n = sysconf(_SC_NPROCESSORS_ONLN);
 #elif HAVE_DECL__SC_NPROC_ONLN
@@ -155,26 +177,18 @@ hwloc_fallback_nbprocessors(struct hwloc_topology *topology) {
     nn = -1;
   n = nn;
 #elif defined(HAVE_SYSCTL) && HAVE_DECL_CTL_HW && HAVE_DECL_HW_NCPU
-  static int name[2] = {CTL_HW, HW_NPCU};
-  if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name)), &n)
+  static int name[2] = {CTL_HW, HW_NCPU};
+  if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name), &n))
     n = -1;
-#elif defined(HWLOC_WIN_SYS)
-  SYSTEM_INFO sysinfo;
-  GetSystemInfo(&sysinfo);
-  n = sysinfo.dwNumberOfProcessors;
 #else
 #ifdef __GNUC__
 #warning No known way to discover number of available processors on this system
-#warning hwloc_fallback_nbprocessors will default to 1
 #endif
   n = -1;
 #endif
-  if (n >= 1)
-    topology->support.discovery->pu = 1;
-  else
-    n = 1;
   return n;
 }
+#endif /* !HWLOC_WIN_SYS */
 
 /*
  * Use the given number of processors to set a PU level.
@@ -189,7 +203,7 @@ hwloc_setup_pu_level(struct hwloc_topology *topology,
   hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
   for (cpu=0,oscpu=0; cpu<nb_pus; oscpu++)
     {
-      obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, oscpu);
+      obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, oscpu);
       obj->cpuset = hwloc_bitmap_alloc();
       hwloc_bitmap_only(obj->cpuset, oscpu);
 
@@ -201,22 +215,55 @@ hwloc_setup_pu_level(struct hwloc_topology *topology,
     }
 }
 
+/* Traverse the children of a parent safely: pchild points to the link that leads to the current child, and
+ * the next child is reread through it after each iteration, so the loop body may drop the current child:  */
+#define for_each_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_memory_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->memory_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_io_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->io_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_misc_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->misc_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+
 #ifdef HWLOC_DEBUG
 /* Just for debugging.  */
 static void
 hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
 {
-  char type[64], idx[10], attr[1024], *cpuset = NULL;
+  char type[64], idx[12], attr[1024], *cpuset = NULL;
   hwloc_debug("%*s", 2*indent, "");
   hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
-  if (obj->os_index != (unsigned) -1)
+  if (obj->os_index != HWLOC_UNKNOWN_INDEX)
     snprintf(idx, sizeof(idx), "#%u", obj->os_index);
   else
     *idx = '\0';
   hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1);
   hwloc_debug("%s%s%s%s%s", type, idx, *attr ? "(" : "", attr, *attr ? ")" : "");
   if (obj->name)
-    hwloc_debug(" name %s", obj->name);
+    hwloc_debug(" name \"%s\"", obj->name);
+  if (obj->subtype)
+    hwloc_debug(" subtype \"%s\"", obj->subtype);
   if (obj->cpuset) {
     hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
     hwloc_debug(" cpuset %s", cpuset);
@@ -227,11 +274,6 @@ hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
     hwloc_debug(" complete %s", cpuset);
     free(cpuset);
   }
-  if (obj->allowed_cpuset) {
-    hwloc_bitmap_asprintf(&cpuset, obj->allowed_cpuset);
-    hwloc_debug(" allowed %s", cpuset);
-    free(cpuset);
-  }
   if (obj->nodeset) {
     hwloc_bitmap_asprintf(&cpuset, obj->nodeset);
     hwloc_debug(" nodeset %s", cpuset);
@@ -242,11 +284,6 @@ hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
     hwloc_debug(" completeN %s", cpuset);
     free(cpuset);
   }
-  if (obj->allowed_nodeset) {
-    hwloc_bitmap_asprintf(&cpuset, obj->allowed_nodeset);
-    hwloc_debug(" allowedN %s", cpuset);
-    free(cpuset);
-  }
   if (obj->arity)
     hwloc_debug(" arity %u", obj->arity);
   hwloc_debug("%s", "\n");
@@ -257,11 +294,13 @@ hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
 {
   hwloc_obj_t child;
   hwloc_debug_print_object(indent, obj);
-  for (child = obj->first_child; child; child = child->next_sibling)
+  for_each_child (child, obj)
     hwloc_debug_print_objects(indent + 1, child);
-  for (child = obj->io_first_child; child; child = child->next_sibling)
+  for_each_memory_child (child, obj)
     hwloc_debug_print_objects(indent + 1, child);
-  for (child = obj->misc_first_child; child; child = child->next_sibling)
+  for_each_io_child (child, obj)
+    hwloc_debug_print_objects(indent + 1, child);
+  for_each_misc_child (child, obj)
     hwloc_debug_print_objects(indent + 1, child);
 }
 #else /* !HWLOC_DEBUG */
@@ -269,7 +308,7 @@ hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
 #define hwloc_debug_print_objects(indent, obj) do { /* nothing */ } while (0)
 #endif /* !HWLOC_DEBUG */
 
-void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count)
+void hwloc__free_infos(struct hwloc_info_s *infos, unsigned count)
 {
   unsigned i;
   for(i=0; i<count; i++) {
@@ -279,45 +318,76 @@ void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count)
   free(infos);
 }
 
-void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value)
+int hwloc__add_info(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value) /* Append a strdup()ed (name, value) pair to *infosp; returns 0 on success, -1 on allocation failure. */
 {
   unsigned count = *countp;
-  struct hwloc_obj_info_s *infos = *infosp;
+  struct hwloc_info_s *infos = *infosp;
 #define OBJECT_INFO_ALLOC 8
   /* nothing allocated initially, (re-)allocate by multiple of 8 */
   unsigned alloccount = (count + 1 + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
-  if (count != alloccount)
-    infos = realloc(infos, alloccount*sizeof(*infos));
+  if (count != alloccount) {
+    struct hwloc_info_s *tmpinfos = realloc(infos, alloccount*sizeof(*infos));
+    if (!tmpinfos)
+      /* failed to allocate, ignore this info */
+      goto out_with_array;
+    *infosp = infos = tmpinfos; /* publish the new array right away so the caller's pointer stays valid on later failures */
+  }
   infos[count].name = strdup(name);
-  infos[count].value = value ? strdup(value) : NULL;
-  *infosp = infos;
+  if (!infos[count].name)
+    goto out_with_array;
+  infos[count].value = strdup(value);
+  if (!infos[count].value)
+    goto out_with_name;
   *countp = count+1;
+  return 0;
+
+ out_with_name:
+  free(infos[count].name);
+ out_with_array:
+  /* don't bother reducing the array */
+  return -1; /* *countp is left unchanged on failure */
 }
 
-char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name)
+int hwloc__add_info_nodup(struct hwloc_info_s **infosp, unsigned *countp,
+			  const char *name, const char *value,
+			  int replace) /* Add (name, value) unless an entry with that name exists; returns 0 on success, -1 on allocation failure. */
 {
+  struct hwloc_info_s *infos = *infosp;
+  unsigned count = *countp;
   unsigned i;
-  for(i=0; i<*countp; i++) {
-    if (!strcmp((*infosp)[i].name, name))
-      return &(*infosp)[i].value;
+  for(i=0; i<count; i++) {
+    if (!strcmp(infos[i].name, name)) {
+      if (replace) { /* overwrite the existing value; without `replace` the existing entry is kept as is */
+	char *new = strdup(value);
+	if (!new)
+	  return -1;
+	free(infos[i].value);
+	infos[i].value = new;
+      }
+      return 0;
+    }
   }
-  hwloc__add_info(infosp, countp, name, NULL);
-  return &(*infosp)[*countp-1].value;
+  return hwloc__add_info(infosp, countp, name, value); /* no entry with that name: append a new one */
 }
 
-void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp,
-		       struct hwloc_obj_info_s **src_infosp, unsigned *src_countp)
+int hwloc__move_infos(struct hwloc_info_s **dst_infosp, unsigned *dst_countp,
+		      struct hwloc_info_s **src_infosp, unsigned *src_countp)
 {
   unsigned dst_count = *dst_countp;
-  struct hwloc_obj_info_s *dst_infos = *dst_infosp;
+  struct hwloc_info_s *dst_infos = *dst_infosp;
   unsigned src_count = *src_countp;
-  struct hwloc_obj_info_s *src_infos = *src_infosp;
+  struct hwloc_info_s *src_infos = *src_infosp;
   unsigned i;
 #define OBJECT_INFO_ALLOC 8
   /* nothing allocated initially, (re-)allocate by multiple of 8 */
   unsigned alloccount = (dst_count + src_count + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
-  if (dst_count != alloccount)
-    dst_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos));
+  if (dst_count != alloccount) {
+    struct hwloc_info_s *tmp_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos));
+    if (!tmp_infos)
+      /* Failed to realloc, ignore the appended infos */
+      goto drop;
+    dst_infos = tmp_infos;
+  }
   for(i=0; i<src_count; i++, dst_count++) {
     dst_infos[dst_count].name = src_infos[i].name;
     dst_infos[dst_count].value = src_infos[i].value;
@@ -327,81 +397,142 @@ void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_count
   free(src_infos);
   *src_infosp = NULL;
   *src_countp = 0;
-}
+  return 0;
 
-void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value)
-{
-  hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+ drop:
+  /* drop src infos, don't modify dst_infos at all */
+  for(i=0; i<src_count; i++) {
+    free(src_infos[i].name);
+    free(src_infos[i].value);
+  }
+  free(src_infos);
+  *src_infosp = NULL;
+  *src_countp = 0;
+  return -1;
 }
 
-void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup)
+int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value) /* Append (name, value) to obj's infos array; returns the hwloc__add_info() result. */
 {
-  if (nodup && hwloc_obj_get_info_by_name(obj, name))
-    return;
-  hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
+  return hwloc__add_info(&obj->infos, &obj->infos_count, name, value);
 }
 
-static int hwloc_obj_type_is_special (hwloc_obj_type_t type)
-{
-  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC + 1 == HWLOC_OBJ_BRIDGE);
-  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
-  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
-  return type >= HWLOC_OBJ_MISC && type <= HWLOC_OBJ_OS_DEVICE;
-}
-static int hwloc_obj_type_is_io (hwloc_obj_type_t type)
+/* Duplicate src's infos array into new using tma allocations; returns 0 on success, -1 on failure. This function may be called with topology->tma set, it cannot free() or realloc() */
+static int hwloc__tma_dup_infos(struct hwloc_tma *tma, hwloc_obj_t new, hwloc_obj_t src)
 {
-  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
-  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
-  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE;
-}
+  unsigned i, j;
+  new->infos = hwloc_tma_calloc(tma, src->infos_count * sizeof(*src->infos));
+  if (!new->infos)
+    return -1;
+  for(i=0; i<src->infos_count; i++) {
+    new->infos[i].name = hwloc_tma_strdup(tma, src->infos[i].name);
+    new->infos[i].value = hwloc_tma_strdup(tma, src->infos[i].value);
+    if (!new->infos[i].name || !new->infos[i].value)
+      goto failed; /* i stays at the index of the partially filled entry */
+  }
+  new->infos_count = src->infos_count; /* the count is only published once every entry was duplicated */
+  return 0;
 
-/* Traverse children of a parent in a safe way: reread the next pointer as
- * appropriate to prevent crash on child deletion:  */
-#define for_each_child_safe(child, parent, pchild) \
-  for (pchild = &(parent)->first_child, child = *pchild; \
-       child; \
-       /* Check whether the current child was not dropped.  */ \
-       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
-       /* Get pointer to next child.  */ \
-        child = *pchild)
-#define for_each_io_child_safe(child, parent, pchild) \
-  for (pchild = &(parent)->io_first_child, child = *pchild; \
-       child; \
-       /* Check whether the current child was not dropped.  */ \
-       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
-       /* Get pointer to next child.  */ \
-        child = *pchild)
-#define for_each_misc_child_safe(child, parent, pchild) \
-  for (pchild = &(parent)->misc_first_child, child = *pchild; \
-       child; \
-       /* Check whether the current child was not dropped.  */ \
-       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
-       /* Get pointer to next child.  */ \
-        child = *pchild)
+ failed:
+  assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+  for(j=0; j<=i; j++) { /* release each entry filled so far exactly once, including the partial entry i */
+    free(new->infos[j].name);
+    free(new->infos[j].value);
+  }
+  free(new->infos);
+  new->infos = NULL;
+  return -1;
+}
 
-/* Free an object and all its content.  */
-void
-hwloc_free_unlinked_object(hwloc_obj_t obj)
+static void
+hwloc__free_object_contents(hwloc_obj_t obj) /* Free everything obj owns (type-specific attributes, infos, attr, children array, strings, bitmaps) but not obj itself. */
 {
   switch (obj->type) {
+  case HWLOC_OBJ_NUMANODE:
+    free(obj->attr->numanode.page_types); /* must happen before obj->attr itself is freed below */
+    break;
   default:
     break;
   }
   hwloc__free_infos(obj->infos, obj->infos_count);
-  hwloc_clear_object_distances(obj);
-  free(obj->memory.page_types);
   free(obj->attr);
   free(obj->children);
+  free(obj->subtype);
   free(obj->name);
   hwloc_bitmap_free(obj->cpuset);
   hwloc_bitmap_free(obj->complete_cpuset);
-  hwloc_bitmap_free(obj->allowed_cpuset);
   hwloc_bitmap_free(obj->nodeset);
   hwloc_bitmap_free(obj->complete_nodeset);
-  hwloc_bitmap_free(obj->allowed_nodeset);
+}
+
+/* Free an object and all its content: the contents via hwloc__free_object_contents(), then the object itself.  */
+void
+hwloc_free_unlinked_object(hwloc_obj_t obj)
+{
+  hwloc__free_object_contents(obj);
   free(obj);
 }
 
+/* Replace old with contents of new object, and make new freeable by the caller.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ */
+static void
+hwloc_replace_linked_object(hwloc_obj_t old, hwloc_obj_t new)
+{
+  /* drop old fields */
+  hwloc__free_object_contents(old);
+  /* copy old tree pointers to new */
+  new->parent = old->parent;
+  new->next_sibling = old->next_sibling;
+  new->first_child = old->first_child;
+  new->memory_first_child = old->memory_first_child;
+  new->io_first_child = old->io_first_child;
+  new->misc_first_child = old->misc_first_child;
+  /* copy new contents to old now that tree pointers are OK */
+  memcpy(old, new, sizeof(*old));
+  /* clear new so that we may free it without releasing the contents that old now owns */
+  memset(new, 0,sizeof(*new));
+}
+
+/* Remove an object and its children from its parent and free them.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery or during destroy.
+ */
+static void
+unlink_and_free_object_and_children(hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+
+  for_each_child_safe(child, obj, pchild) /* the _safe iterators are needed because each call unlinks the child it visits */
+    unlink_and_free_object_and_children(pchild);
+  for_each_memory_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_io_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_misc_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+
+  *pobj = obj->next_sibling; /* unlink: the incoming link now skips obj */
+  hwloc_free_unlinked_object(obj);
+}
+
+/* Free an object and its children without unlinking from parent.
+ */
+void
+hwloc_free_object_and_children(hwloc_obj_t obj)
+{
+  unlink_and_free_object_and_children(&obj); /* &obj is the local copy, so the "unlink" step only rewrites this variable, not the parent's list */
+}
+
+/* Free an object, its next siblings and their children without unlinking from parent.
+ */
+void
+hwloc_free_object_siblings_and_children(hwloc_obj_t obj)
+{
+  while (obj) /* each call frees obj and replaces the local obj with its next sibling */
+    unlink_and_free_object_and_children(&obj);
+}
+
 /* insert the (non-empty) list of sibling starting at firstnew as new children of newparent,
  * and return the address of the pointer to the next one
  */
@@ -419,16 +550,57 @@ insert_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newp
   return &tmp->next_sibling;
 }
 
+/* Take the new list starting at firstnew and prepend it to the old list starting at *firstp,
+ * and mark the new children as children of newparent.
+ * May be used during early or late discovery (updates prev_sibling and sibling_rank).
+ * List firstnew must be non-NULL.
+ */
+static void
+prepend_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t *tmpp, tmp, last;
+  unsigned length;
+
+  /* update parent pointers and find the length and end of the new list */
+  for(length = 0, tmpp = &firstnew, last = NULL ; *tmpp; length++, last = *tmpp, tmpp = &((*tmpp)->next_sibling))
+    (*tmpp)->parent = newparent;
+
+  /* update sibling_rank */
+  for(tmp = *firstp; tmp; tmp = tmp->next_sibling)
+    tmp->sibling_rank += length; /* if it wasn't initialized yet, it'll be overwritten later */
+
+  /* place the existing list at the end of the new one */
+  *tmpp = *firstp;
+  if (*firstp)
+    (*firstp)->prev_sibling = last;
+
+  /* use the beginning of the new list now */
+  *firstp = firstnew;
+}
+
+/* Take the new list starting at firstnew and append it to the old list starting at *firstp,
+ * and mark the new children as children of newparent.
+ * May be used during early or late discovery (updates prev_sibling and sibling_rank).
+ */
 static void
 append_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
 {
-  hwloc_obj_t *tmpp, tmp;
-  /* find the end of the list */
-  for(tmpp = firstp ; *tmpp; tmpp = &((*tmpp)->next_sibling));
-  *tmpp = firstnew;
-  /* update parent pointers */
-  for(tmp = firstnew; tmp; tmp = tmp->next_sibling)
+  hwloc_obj_t *tmpp, tmp, last;
+  unsigned length;
+
+  /* find the length and end of the existing list */
+  for(length = 0, tmpp = firstp, last = NULL ; *tmpp; length++, last = *tmpp, tmpp = &((*tmpp)->next_sibling));
+
+  /* update parent pointers and sibling_rank */
+  for(tmp = firstnew; tmp; tmp = tmp->next_sibling) {
     tmp->parent = newparent;
+    tmp->sibling_rank += length; /* if it wasn't set yet, it'll be overwritten later */
+  }
+
+  /* place new list at the end of the old one */
+  *tmpp = firstnew;
+  if (firstnew)
+    firstnew->prev_sibling = last;
 }
 
 /* Remove an object from its parent and free it.
@@ -450,7 +622,8 @@ unlink_and_free_single_object(hwloc_obj_t *pparent)
 
     /* no normal children */
     assert(!old->first_child);
-
+    /* no memory children */
+    assert(!old->memory_first_child);
     /* no I/O children */
     assert(!old->io_first_child);
 
@@ -462,11 +635,13 @@ unlink_and_free_single_object(hwloc_obj_t *pparent)
     /* append old siblings back */
     *lastp = old->next_sibling;
 
-  } else if (hwloc_obj_type_is_io(old->type)) {
+  } else if (hwloc__obj_type_is_io(old->type)) {
     /* I/O object */
 
     /* no normal children */
     assert(!old->first_child);
+    /* no memory children */
+    assert(!old->memory_first_child);
 
     if (old->io_first_child)
       /* insert old I/O object children as new siblings below parent instead of old */
@@ -480,6 +655,26 @@ unlink_and_free_single_object(hwloc_obj_t *pparent)
     if (old->misc_first_child)
       append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
 
+  } else if (hwloc__obj_type_is_memory(old->type)) {
+    /* memory object */
+
+    /* no normal children */
+    assert(!old->first_child);
+    /* no I/O children */
+    assert(!old->io_first_child);
+
+    if (old->memory_first_child)
+      /* insert old memory object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->memory_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old Misc children to parent */
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
   } else {
     /* Normal object */
 
@@ -491,9 +686,11 @@ unlink_and_free_single_object(hwloc_obj_t *pparent)
     /* append old siblings back */
     *lastp = old->next_sibling;
 
-    /* append old I/O and Misc children to parent
+    /* append old memory, I/O and Misc children to parent
      * old->parent cannot be NULL (removing root), misc children should have been moved by the caller earlier.
      */
+    if (old->memory_first_child)
+      append_siblings_list(&old->parent->memory_first_child, old->memory_first_child, old->parent);
     if (old->io_first_child)
       append_siblings_list(&old->parent->io_first_child, old->io_first_child, old->parent);
     if (old->misc_first_child)
@@ -503,107 +700,215 @@ unlink_and_free_single_object(hwloc_obj_t *pparent)
   hwloc_free_unlinked_object(old);
 }
 
-/* Remove an object and its children from its parent and free them.
- * Only updates next_sibling/first_child pointers,
- * so may only be used during early discovery.
- */
-static void
-unlink_and_free_object_and_children(hwloc_obj_t *pobj)
-{
-  hwloc_obj_t obj = *pobj, child, *pchild;
-
-  for_each_child_safe(child, obj, pchild)
-    unlink_and_free_object_and_children(pchild);
-  for_each_io_child_safe(child, obj, pchild)
-    unlink_and_free_object_and_children(pchild);
-  for_each_misc_child_safe(child, obj, pchild)
-    unlink_and_free_object_and_children(pchild);
-
-  *pobj = obj->next_sibling;
-  hwloc_free_unlinked_object(obj);
-}
-
-static void
-hwloc__duplicate_object(struct hwloc_obj *newobj,
+/* Recursively duplicate src and its children into newtopology (0 on success, <0 on error). May use a tma: it cannot free() or realloc(). */
+static int
+hwloc__duplicate_object(struct hwloc_topology *newtopology,
+			struct hwloc_obj *newparent,
+			struct hwloc_obj *newobj,
 			struct hwloc_obj *src)
 {
+  struct hwloc_tma *tma = newtopology->tma;
+  hwloc_obj_t *level;
+  unsigned level_width;
   size_t len;
   unsigned i;
+  hwloc_obj_t child, prev;
+  int err = 0;
+
+  /* either we're duplicating to an already allocated new root, which has no newparent,
+   * or we're duplicating to a not-yet-allocated new non-root, which will have a newparent.
+   */
+  assert(!newparent == !!newobj);
+
+  if (!newobj) {
+    newobj = hwloc_alloc_setup_object(newtopology, src->type, src->os_index);
+    if (!newobj)
+      return -1;
+  }
+
+  /* duplicate all non-object-pointer fields */
+  newobj->logical_index = src->logical_index;
+  newobj->depth = src->depth;
+  newobj->sibling_rank = src->sibling_rank;
 
   newobj->type = src->type;
   newobj->os_index = src->os_index;
+  newobj->gp_index = src->gp_index;
+  newobj->symmetric_subtree = src->symmetric_subtree;
 
   if (src->name)
-    newobj->name = strdup(src->name);
+    newobj->name = hwloc_tma_strdup(tma, src->name);
+  if (src->subtype)
+    newobj->subtype = hwloc_tma_strdup(tma, src->subtype);
   newobj->userdata = src->userdata;
 
-  memcpy(&newobj->memory, &src->memory, sizeof(struct hwloc_obj_memory_s));
-  if (src->memory.page_types_len) {
-    len = src->memory.page_types_len * sizeof(struct hwloc_obj_memory_page_type_s);
-    newobj->memory.page_types = malloc(len);
-    memcpy(newobj->memory.page_types, src->memory.page_types, len);
-  }
+  newobj->total_memory = src->total_memory;
 
   memcpy(newobj->attr, src->attr, sizeof(*newobj->attr));
 
-  newobj->cpuset = hwloc_bitmap_dup(src->cpuset);
-  newobj->complete_cpuset = hwloc_bitmap_dup(src->complete_cpuset);
-  newobj->allowed_cpuset = hwloc_bitmap_dup(src->allowed_cpuset);
-  newobj->nodeset = hwloc_bitmap_dup(src->nodeset);
-  newobj->complete_nodeset = hwloc_bitmap_dup(src->complete_nodeset);
-  newobj->allowed_nodeset = hwloc_bitmap_dup(src->allowed_nodeset);
-
-  /* don't duplicate distances, they'll be recreated at the end of the topology build */
+  if (src->type == HWLOC_OBJ_NUMANODE && src->attr->numanode.page_types_len) {
+    len = src->attr->numanode.page_types_len * sizeof(struct hwloc_memory_page_type_s);
+    newobj->attr->numanode.page_types = hwloc_tma_malloc(tma, len);
+    memcpy(newobj->attr->numanode.page_types, src->attr->numanode.page_types, len);
+  }
 
-  for(i=0; i<src->infos_count; i++)
-    hwloc__add_info(&newobj->infos, &newobj->infos_count, src->infos[i].name, src->infos[i].value);
-}
+  newobj->cpuset = hwloc_bitmap_tma_dup(tma, src->cpuset);
+  newobj->complete_cpuset = hwloc_bitmap_tma_dup(tma, src->complete_cpuset);
+  newobj->nodeset = hwloc_bitmap_tma_dup(tma, src->nodeset);
+  newobj->complete_nodeset = hwloc_bitmap_tma_dup(tma, src->complete_nodeset);
+
+  hwloc__tma_dup_infos(tma, newobj, src);
+
+  /* find our level */
+  if (src->depth < 0) {
+    i = HWLOC_SLEVEL_FROM_DEPTH(src->depth);
+    level = newtopology->slevels[i].objs;
+    level_width = newtopology->slevels[i].nbobjs;
+    /* deal with first/last pointers of special levels, even if not really needed */
+    if (!newobj->logical_index)
+      newtopology->slevels[i].first = newobj;
+    if (newobj->logical_index == newtopology->slevels[i].nbobjs - 1)
+      newtopology->slevels[i].last = newobj;
+  } else {
+    level = newtopology->levels[src->depth];
+    level_width = newtopology->level_nbobjects[src->depth];
+  }
+  /* place us for real */
+  assert(newobj->logical_index < level_width);
+  level[newobj->logical_index] = newobj;
+  /* link to already-inserted cousins */
+  if (newobj->logical_index > 0 && level[newobj->logical_index-1]) {
+    newobj->prev_cousin = level[newobj->logical_index-1];
+    level[newobj->logical_index-1]->next_cousin = newobj;
+  }
+  if (newobj->logical_index < level_width-1 && level[newobj->logical_index+1]) {
+    newobj->next_cousin = level[newobj->logical_index+1];
+    level[newobj->logical_index+1]->prev_cousin = newobj;
+  }
 
-void
-hwloc__duplicate_objects(struct hwloc_topology *newtopology,
-			 struct hwloc_obj *newparent,
-			 struct hwloc_obj *src)
-{
-  hwloc_obj_t newobj;
-  hwloc_obj_t child;
+  /* prepare for children */
+  if (src->arity) {
+    newobj->children = hwloc_tma_malloc(tma, src->arity * sizeof(*newobj->children));
+    if (!newobj->children)
+      return -1;
+  }
+  newobj->arity = src->arity;
+  newobj->memory_arity = src->memory_arity;
+  newobj->io_arity = src->io_arity;
+  newobj->misc_arity = src->misc_arity;
+
+  /* actually insert children now */
+  for_each_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_memory_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_io_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
+  for_each_misc_child(child, src) {
+    err = hwloc__duplicate_object(newtopology, newobj, NULL, child);
+    if (err < 0)
+      goto out_with_children;
+  }
 
-  newobj = hwloc_alloc_setup_object(src->type, src->os_index);
-  hwloc__duplicate_object(newobj, src);
+ out_with_children:
 
-  for(child = src->first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(newtopology, newobj, child);
-  for(child = src->io_first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(newtopology, newobj, child);
-  for(child = src->misc_first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(newtopology, newobj, child);
+  /* link children if all of them were inserted */
+  if (!err) {
+    /* only next_sibling is set by insert_by_parent().
+     * sibling_rank was set above.
+     */
+    if (newobj->arity) {
+      newobj->children[0]->prev_sibling = NULL;
+      for(i=1; i<newobj->arity; i++)
+	newobj->children[i]->prev_sibling = newobj->children[i-1];
+      newobj->last_child = newobj->children[newobj->arity-1];
+    }
+    if (newobj->memory_arity) {
+      child = newobj->memory_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+    if (newobj->io_arity) {
+      child = newobj->io_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+    if (newobj->misc_arity) {
+      child = newobj->misc_first_child;
+      prev = NULL;
+      while (child) {
+	child->prev_sibling = prev;
+	prev = child;
+	child = child->next_sibling;
+      }
+    }
+  }
 
-  /* no need to check the children order here, the source topology
-   * is supposed to be OK already, and we have debug asserts.
+  /* some children insertion may have failed, but some children may have been inserted below us already.
+   * keep inserting ourself and let the caller clean the entire tree if we return an error.
    */
-  hwloc_insert_object_by_parent(newtopology, newparent, newobj);
+
+  if (newparent) {
+    /* no need to check the children insert order here, the source topology
+     * is supposed to be OK already, and we have debug asserts.
+     */
+    hwloc_insert_object_by_parent(newtopology, newparent, newobj);
+
+    /* place us inside our parent children array */
+    if (hwloc__obj_type_is_normal(newobj->type))
+      newparent->children[newobj->sibling_rank] = newobj;
+  }
+
+  return err;
 }
 
+static int
+hwloc__topology_init (struct hwloc_topology **topologyp, unsigned nblevels, struct hwloc_tma *tma);
+
+/* This function may use a tma, it cannot free() or realloc() */
 int
-hwloc_topology_dup(hwloc_topology_t *newp,
-		   hwloc_topology_t old)
+hwloc__topology_dup(hwloc_topology_t *newp,
+		    hwloc_topology_t old,
+		    struct hwloc_tma *tma)
 {
   hwloc_topology_t new;
   hwloc_obj_t newroot;
   hwloc_obj_t oldroot = hwloc_get_root_obj(old);
-  hwloc_obj_t child;
+  unsigned i;
+  int err;
 
   if (!old->is_loaded) {
-    errno = -EINVAL;
+    errno = EINVAL;
     return -1;
   }
 
-  hwloc_topology_init(&new);
+  err = hwloc__topology_init(&new, old->nb_levels_allocated, tma);
+  if (err < 0)
+    goto out;
 
   new->flags = old->flags;
-  memcpy(new->ignored_types, old->ignored_types, sizeof(old->ignored_types));
+  memcpy(new->type_filter, old->type_filter, sizeof(old->type_filter));
   new->is_thissystem = old->is_thissystem;
   new->is_loaded = 1;
   new->pid = old->pid;
+  new->next_gp_index = old->next_gp_index;
 
   memcpy(&new->binding_hooks, &old->binding_hooks, sizeof(old->binding_hooks));
 
@@ -611,57 +916,49 @@ hwloc_topology_dup(hwloc_topology_t *newp,
   memcpy(new->support.cpubind, old->support.cpubind, sizeof(*old->support.cpubind));
   memcpy(new->support.membind, old->support.membind, sizeof(*old->support.membind));
 
+  new->allowed_cpuset = hwloc_bitmap_tma_dup(tma, old->allowed_cpuset);
+  new->allowed_nodeset = hwloc_bitmap_tma_dup(tma, old->allowed_nodeset);
+
   new->userdata_export_cb = old->userdata_export_cb;
   new->userdata_import_cb = old->userdata_import_cb;
+  new->userdata_not_decoded = old->userdata_not_decoded;
 
-  newroot = hwloc_get_root_obj(new);
-  hwloc__duplicate_object(newroot, oldroot);
-
-  for(child = oldroot->first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(new, newroot, child);
-  for(child = oldroot->io_first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(new, newroot, child);
-  for(child = oldroot->misc_first_child; child; child = child->next_sibling)
-    hwloc__duplicate_objects(new, newroot, child);
-
-  if (old->first_osdist) {
-    struct hwloc_os_distances_s *olddist = old->first_osdist;
-    while (olddist) {
-      struct hwloc_os_distances_s *newdist = malloc(sizeof(*newdist));
-      newdist->type = olddist->type;
-      newdist->nbobjs = olddist->nbobjs;
-      newdist->indexes = malloc(newdist->nbobjs * sizeof(*newdist->indexes));
-      memcpy(newdist->indexes, olddist->indexes, newdist->nbobjs * sizeof(*newdist->indexes));
-      newdist->objs = NULL; /* will be recomputed when needed */
-      newdist->distances = malloc(newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
-      memcpy(newdist->distances, olddist->distances, newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
-
-      newdist->forced = olddist->forced;
-      if (new->first_osdist) {
-	new->last_osdist->next = newdist;
-	newdist->prev = new->last_osdist;
-      } else {
-	new->first_osdist = newdist;
-	newdist->prev = NULL;
-      }
-      new->last_osdist = newdist;
-      newdist->next = NULL;
+  assert(!old->machine_memory.local_memory);
+  assert(!old->machine_memory.page_types_len);
+  assert(!old->machine_memory.page_types);
 
-      olddist = olddist->next;
-    }
-  } else
-    new->first_osdist = old->last_osdist = NULL;
+  for(i = HWLOC_OBJ_TYPE_MIN; i < HWLOC_OBJ_TYPE_MAX; i++)
+    new->type_depth[i] = old->type_depth[i];
 
-  /* no need to duplicate backends, topology is already loaded */
-  new->backends = NULL;
+  /* duplicate levels and we'll place objects there when duplicating objects */
+  new->nb_levels = old->nb_levels;
+  assert(new->nb_levels_allocated >= new->nb_levels);
+  for(i=1 /* root level already allocated */ ; i<new->nb_levels; i++) {
+    new->level_nbobjects[i] = old->level_nbobjects[i];
+    new->levels[i] = hwloc_tma_calloc(tma, new->level_nbobjects[i] * sizeof(*new->levels[i]));
+  }
+  for(i=0; i<HWLOC_NR_SLEVELS; i++) {
+    new->slevels[i].nbobjs = old->slevels[i].nbobjs;
+    if (new->slevels[i].nbobjs)
+      new->slevels[i].objs = hwloc_tma_calloc(tma, new->slevels[i].nbobjs * sizeof(*new->slevels[i].objs));
+  }
 
-  hwloc_connect_children(new->levels[0][0]);
-  if (hwloc_connect_levels(new) < 0)
-    goto out;
+  /* recursively duplicate object children */
+  newroot = hwloc_get_root_obj(new);
+  err = hwloc__duplicate_object(new, NULL, newroot, oldroot);
+  if (err < 0)
+    goto out_with_topology;
+
+  err = hwloc_internal_distances_dup(new, old);
+  if (err < 0)
+    goto out_with_topology;
+
+  /* we connected everything during duplication */
   new->modified = 0;
 
-  hwloc_distances_finalize_os(new);
-  hwloc_distances_finalize_logical(new);
+  /* no need to duplicate backends, topology is already loaded */
+  new->backends = NULL;
+  new->get_pci_busid_cpuset_backend = NULL;
 
 #ifndef HWLOC_DEBUG
   if (getenv("HWLOC_DEBUG_CHECK"))
@@ -671,26 +968,19 @@ hwloc_topology_dup(hwloc_topology_t *newp,
   *newp = new;
   return 0;
 
+ out_with_topology:
+  assert(!tma || !tma->dontfree); /* this tma cannot fail to allocate */
+  hwloc_topology_destroy(new);
  out:
-  hwloc_topology_clear(new);
-  hwloc_distances_destroy(new);
-  hwloc_topology_setup_defaults(new);
   return -1;
 }
 
-/*
- * How to compare objects based on types.
- *
- * Note that HIGHER/LOWER is only a (consistent) heuristic, used to sort
- * objects with same cpuset consistently.
- * Only EQUAL / not EQUAL can be relied upon.
- */
-
-enum hwloc_type_cmp_e {
-  HWLOC_TYPE_HIGHER,
-  HWLOC_TYPE_DEEPER,
-  HWLOC_TYPE_EQUAL
-};
+int
+hwloc_topology_dup(hwloc_topology_t *newp,
+		   hwloc_topology_t old)
+{
+  return hwloc__topology_dup(newp, old, NULL);
+}
 
 /* WARNING: The indexes of this array MUST match the ordering that of
    the obj_order_type[] array, below.  Specifically, the values must
@@ -708,98 +998,120 @@ enum hwloc_type_cmp_e {
    We can't use C99 syntax to initialize this in a little safer manner
    -- bummer.  :-(
 
-   *************************************************************
-   *** DO NOT CHANGE THE ORDERING OF THIS ARRAY WITHOUT TRIPLE
-   *** CHECKING ITS CORRECTNESS!
-   *************************************************************
+   Correctness is asserted in hwloc_topology_init() when debug is enabled.
    */
+/***** Make sure you update obj_type_priority[] below as well. *****/
 static const unsigned obj_type_order[] = {
-    /* first entry is HWLOC_OBJ_SYSTEM */  0,
-    /* next entry is HWLOC_OBJ_MACHINE */  1,
-    /* next entry is HWLOC_OBJ_NUMANODE */ 3,
+    /* first entry is HWLOC_OBJ_MACHINE */  0,
     /* next entry is HWLOC_OBJ_PACKAGE */  4,
-    /* next entry is HWLOC_OBJ_CACHE */    5,
-    /* next entry is HWLOC_OBJ_CORE */     6,
-    /* next entry is HWLOC_OBJ_PU */       10,
-    /* next entry is HWLOC_OBJ_GROUP */    2,
-    /* next entry is HWLOC_OBJ_MISC */     11,
-    /* next entry is HWLOC_OBJ_BRIDGE */   7,
-    /* next entry is HWLOC_OBJ_PCI_DEVICE */  8,
-    /* next entry is HWLOC_OBJ_OS_DEVICE */   9
+    /* next entry is HWLOC_OBJ_CORE */     14,
+    /* next entry is HWLOC_OBJ_PU */       18,
+    /* next entry is HWLOC_OBJ_L1CACHE */  12,
+    /* next entry is HWLOC_OBJ_L2CACHE */  10,
+    /* next entry is HWLOC_OBJ_L3CACHE */  8,
+    /* next entry is HWLOC_OBJ_L4CACHE */  7,
+    /* next entry is HWLOC_OBJ_L5CACHE */  6,
+    /* next entry is HWLOC_OBJ_L1ICACHE */ 13,
+    /* next entry is HWLOC_OBJ_L2ICACHE */ 11,
+    /* next entry is HWLOC_OBJ_L3ICACHE */ 9,
+    /* next entry is HWLOC_OBJ_GROUP */    1,
+    /* next entry is HWLOC_OBJ_NUMANODE */ 3,
+    /* next entry is HWLOC_OBJ_BRIDGE */   15,
+    /* next entry is HWLOC_OBJ_PCI_DEVICE */  16,
+    /* next entry is HWLOC_OBJ_OS_DEVICE */   17,
+    /* next entry is HWLOC_OBJ_MISC */     19,
+    /* next entry is HWLOC_OBJ_MEMCACHE */ 2,
+    /* next entry is HWLOC_OBJ_DIE */      5
 };
 
+#ifndef NDEBUG /* only used in debug check assert if !NDEBUG */
 static const hwloc_obj_type_t obj_order_type[] = {
-  HWLOC_OBJ_SYSTEM,
   HWLOC_OBJ_MACHINE,
   HWLOC_OBJ_GROUP,
+  HWLOC_OBJ_MEMCACHE,
   HWLOC_OBJ_NUMANODE,
   HWLOC_OBJ_PACKAGE,
-  HWLOC_OBJ_CACHE,
+  HWLOC_OBJ_DIE,
+  HWLOC_OBJ_L5CACHE,
+  HWLOC_OBJ_L4CACHE,
+  HWLOC_OBJ_L3CACHE,
+  HWLOC_OBJ_L3ICACHE,
+  HWLOC_OBJ_L2CACHE,
+  HWLOC_OBJ_L2ICACHE,
+  HWLOC_OBJ_L1CACHE,
+  HWLOC_OBJ_L1ICACHE,
   HWLOC_OBJ_CORE,
   HWLOC_OBJ_BRIDGE,
   HWLOC_OBJ_PCI_DEVICE,
   HWLOC_OBJ_OS_DEVICE,
   HWLOC_OBJ_PU,
-  HWLOC_OBJ_MISC,
+  HWLOC_OBJ_MISC /* Misc is always a leaf */
 };
+#endif
+/***** Make sure you update obj_type_priority[] below as well. *****/
 
 /* priority to be used when merging identical parent/children object
  * (in merge_useless_child), keep the highest priority one.
  *
- * Always keep Machine/PU/PCIDev/OSDev
- * then System/Node
+ * Always keep Machine/NUMANode/PU/PCIDev/OSDev
  * then Core
  * then Package
- * then Cache
+ * then Die
+ * then Cache,
+ * then Instruction Caches
  * then always drop Group/Misc/Bridge.
  *
  * Some type won't actually ever be involved in such merging.
  */
+/***** Make sure you update this array when changing the list of types. *****/
 static const int obj_type_priority[] = {
-  /* first entry is HWLOC_OBJ_SYSTEM */     80,
-  /* next entry is HWLOC_OBJ_MACHINE */     90,
-  /* next entry is HWLOC_OBJ_NUMANODE */    100,
+  /* first entry is HWLOC_OBJ_MACHINE */     90,
   /* next entry is HWLOC_OBJ_PACKAGE */     40,
-  /* next entry is HWLOC_OBJ_CACHE */       20,
   /* next entry is HWLOC_OBJ_CORE */        60,
   /* next entry is HWLOC_OBJ_PU */          100,
+  /* next entry is HWLOC_OBJ_L1CACHE */     20,
+  /* next entry is HWLOC_OBJ_L2CACHE */     20,
+  /* next entry is HWLOC_OBJ_L3CACHE */     20,
+  /* next entry is HWLOC_OBJ_L4CACHE */     20,
+  /* next entry is HWLOC_OBJ_L5CACHE */     20,
+  /* next entry is HWLOC_OBJ_L1ICACHE */    19,
+  /* next entry is HWLOC_OBJ_L2ICACHE */    19,
+  /* next entry is HWLOC_OBJ_L3ICACHE */    19,
   /* next entry is HWLOC_OBJ_GROUP */       0,
-  /* next entry is HWLOC_OBJ_MISC */        0,
+  /* next entry is HWLOC_OBJ_NUMANODE */    100,
   /* next entry is HWLOC_OBJ_BRIDGE */      0,
   /* next entry is HWLOC_OBJ_PCI_DEVICE */  100,
-  /* next entry is HWLOC_OBJ_OS_DEVICE */   100
+  /* next entry is HWLOC_OBJ_OS_DEVICE */   100,
+  /* next entry is HWLOC_OBJ_MISC */        0,
+  /* next entry is HWLOC_OBJ_MEMCACHE */    19,
+  /* next entry is HWLOC_OBJ_DIE */         30
 };
 
-static unsigned __hwloc_attribute_const
-hwloc_get_type_order(hwloc_obj_type_t type)
-{
-  return obj_type_order[type];
-}
-
-#if !defined(NDEBUG)
-static hwloc_obj_type_t hwloc_get_order_type(int order)
-{
-  return obj_order_type[order];
-}
-#endif
-
 int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2)
 {
-  unsigned order1 = hwloc_get_type_order(type1);
-  unsigned order2 = hwloc_get_type_order(type2);
+  unsigned order1 = obj_type_order[type1];
+  unsigned order2 = obj_type_order[type2];
 
-  /* I/O are only comparable with each others and with machine and system */
-  if (hwloc_obj_type_is_io(type1)
-      && !hwloc_obj_type_is_io(type2) && type2 != HWLOC_OBJ_SYSTEM && type2 != HWLOC_OBJ_MACHINE)
+  /* only normal objects are comparable. others are only comparable with machine */
+  if (!hwloc__obj_type_is_normal(type1)
+      && hwloc__obj_type_is_normal(type2) && type2 != HWLOC_OBJ_MACHINE)
     return HWLOC_TYPE_UNORDERED;
-  if (hwloc_obj_type_is_io(type2)
-      && !hwloc_obj_type_is_io(type1) && type1 != HWLOC_OBJ_SYSTEM && type1 != HWLOC_OBJ_MACHINE)
+  if (!hwloc__obj_type_is_normal(type2)
+      && hwloc__obj_type_is_normal(type1) && type1 != HWLOC_OBJ_MACHINE)
     return HWLOC_TYPE_UNORDERED;
 
   return order1 - order2;
 }
 
-static enum hwloc_type_cmp_e
+enum hwloc_obj_cmp_e {
+  HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL,			/**< \brief Equal */
+  HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED,		/**< \brief Strictly included into */
+  HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS,		/**< \brief Strictly contains */
+  HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS,	/**< \brief Intersects, but no inclusion! */
+  HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT		/**< \brief No intersection */
+};
+
+static enum hwloc_obj_cmp_e
 hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
 {
   hwloc_obj_type_t type1 = obj1->type;
@@ -808,68 +1120,30 @@ hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
 
   compare = hwloc_compare_types(type1, type2);
   if (compare == HWLOC_TYPE_UNORDERED)
-    return HWLOC_TYPE_EQUAL; /* we cannot do better */
+    return HWLOC_OBJ_DIFFERENT; /* we cannot do better */
   if (compare > 0)
-    return HWLOC_TYPE_DEEPER;
+    return HWLOC_OBJ_INCLUDED;
   if (compare < 0)
-    return HWLOC_TYPE_HIGHER;
-
-  /* Caches have the same types but can have different depths.  */
-  if (type1 == HWLOC_OBJ_CACHE) {
-    if (obj1->attr->cache.depth < obj2->attr->cache.depth)
-      return HWLOC_TYPE_DEEPER;
-    else if (obj1->attr->cache.depth > obj2->attr->cache.depth)
-      return HWLOC_TYPE_HIGHER;
-    else if (obj1->attr->cache.type > obj2->attr->cache.type)
-      /* consider icache deeper than dcache and dcache deeper than unified */
-      return HWLOC_TYPE_DEEPER;
-    else if (obj1->attr->cache.type < obj2->attr->cache.type)
-      /* consider icache deeper than dcache and dcache deeper than unified */
-      return HWLOC_TYPE_HIGHER;
-  }
-
-  /* Group objects have the same types but can have different depths.  */
-  if (type1 == HWLOC_OBJ_GROUP) {
-    if (obj1->attr->group.depth == (unsigned) -1
-	|| obj2->attr->group.depth == (unsigned) -1)
-      return HWLOC_TYPE_EQUAL;
-    if (obj1->attr->group.depth < obj2->attr->group.depth)
-      return HWLOC_TYPE_DEEPER;
-    else if (obj1->attr->group.depth > obj2->attr->group.depth)
-      return HWLOC_TYPE_HIGHER;
-  }
+    return HWLOC_OBJ_CONTAINS;
 
-  /* Bridges objects have the same types but can have different depths.  */
-  if (type1 == HWLOC_OBJ_BRIDGE) {
-    if (obj1->attr->bridge.depth < obj2->attr->bridge.depth)
-      return HWLOC_TYPE_DEEPER;
-    else if (obj1->attr->bridge.depth > obj2->attr->bridge.depth)
-      return HWLOC_TYPE_HIGHER;
-  }
+  if (obj1->type == HWLOC_OBJ_GROUP
+      && (obj1->attr->group.kind != obj2->attr->group.kind
+	  || obj1->attr->group.subkind != obj2->attr->group.subkind))
+    return HWLOC_OBJ_DIFFERENT; /* we cannot do better */
 
-  return HWLOC_TYPE_EQUAL;
+  return HWLOC_OBJ_EQUAL;
 }
 
 /*
  * How to compare objects based on cpusets.
  */
-
-enum hwloc_obj_cmp_e {
-  HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL,			/**< \brief Equal */
-  HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED,		/**< \brief Strictly included into */
-  HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS,		/**< \brief Strictly contains */
-  HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS,	/**< \brief Intersects, but no inclusion! */
-  HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT		/**< \brief No intersection */
-};
-
 static int
 hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
 {
   hwloc_bitmap_t set1, set2;
-  int res = HWLOC_OBJ_DIFFERENT;
 
-  assert(!hwloc_obj_type_is_special(obj1->type));
-  assert(!hwloc_obj_type_is_special(obj2->type));
+  assert(!hwloc__obj_type_is_special(obj1->type));
+  assert(!hwloc__obj_type_is_special(obj2->type));
 
   /* compare cpusets first */
   if (obj1->complete_cpuset && obj2->complete_cpuset) {
@@ -879,60 +1153,10 @@ hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
     set1 = obj1->cpuset;
     set2 = obj2->cpuset;
   }
-  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
-    res = hwloc_bitmap_compare_inclusion(set1, set2);
-    if (res == HWLOC_OBJ_INTERSECTS)
-      return HWLOC_OBJ_INTERSECTS;
-  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2))
+    return hwloc_bitmap_compare_inclusion(set1, set2);
 
-  /* then compare nodesets, and combine the results */
-  if (obj1->complete_nodeset && obj2->complete_nodeset) {
-    set1 = obj1->complete_nodeset;
-    set2 = obj2->complete_nodeset;
-  } else {
-    set1 = obj1->nodeset;
-    set2 = obj2->nodeset;
-  }
-  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
-    int noderes = hwloc_bitmap_compare_inclusion(set1, set2);
-    /* deal with conflicting cpusets/nodesets inclusions */
-    if (noderes == HWLOC_OBJ_INCLUDED) {
-      if (res == HWLOC_OBJ_CONTAINS)
-	/* contradicting order for cpusets and nodesets */
-	return HWLOC_OBJ_INTERSECTS;
-      res = HWLOC_OBJ_INCLUDED;
-
-    } else if (noderes == HWLOC_OBJ_CONTAINS) {
-      if (res == HWLOC_OBJ_INCLUDED)
-	/* contradicting order for cpusets and nodesets */
-	return HWLOC_OBJ_INTERSECTS;
-      res = HWLOC_OBJ_CONTAINS;
-
-    } else if (noderes == HWLOC_OBJ_INTERSECTS) {
-      return HWLOC_OBJ_INTERSECTS;
-
-    } else {
-      /* nodesets are different, keep the cpuset order */
-      /* FIXME: with upcoming multiple levels of NUMA, we may have to report INCLUDED or CONTAINED here */
-
-    }
-  }
-
-  return res;
-}
-
-static int
-hwloc_obj_cmp_types(hwloc_obj_t obj1, hwloc_obj_t obj2)
-{
-  /* Same sets, subsort by type to have a consistent ordering.  */
-  int typeres = hwloc_type_cmp(obj1, obj2);
-  if (typeres == HWLOC_TYPE_DEEPER)
-    return HWLOC_OBJ_INCLUDED;
-  if (typeres == HWLOC_TYPE_HIGHER)
-    return HWLOC_OBJ_CONTAINS;
-
-  /* Same sets and types!  Let's hope it's coherent.  */
-  return HWLOC_OBJ_EQUAL;
+  return HWLOC_OBJ_DIFFERENT;
 }
 
 /* Compare object cpusets based on complete_cpuset if defined (always correctly ordered),
@@ -945,8 +1169,9 @@ hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2)
 {
   if (obj1->complete_cpuset && obj2->complete_cpuset)
     return hwloc_bitmap_compare_first(obj1->complete_cpuset, obj2->complete_cpuset);
-  else
+  else if (obj1->cpuset && obj2->cpuset)
     return hwloc_bitmap_compare_first(obj1->cpuset, obj2->cpuset);
+  return 0;
 }
 
 /* format the obj info to print in error messages */
@@ -955,15 +1180,23 @@ hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
 {
 	char typestr[64];
 	char *cpusetstr;
+	char *nodesetstr = NULL;
 	hwloc_obj_type_snprintf(typestr, sizeof(typestr), obj, 0);
 	hwloc_bitmap_asprintf(&cpusetstr, obj->cpuset);
-	if (obj->os_index != (unsigned) -1)
-	  snprintf(buf, buflen, "%s (P#%u cpuset %s)",
-		   typestr, obj->os_index, cpusetstr);
+	if (obj->nodeset) /* may be missing during insert */
+	  hwloc_bitmap_asprintf(&nodesetstr, obj->nodeset);
+	if (obj->os_index != HWLOC_UNKNOWN_INDEX)
+	  snprintf(buf, buflen, "%s (P#%u cpuset %s%s%s)",
+		   typestr, obj->os_index, cpusetstr,
+		   nodesetstr ? " nodeset " : "",
+		   nodesetstr ? nodesetstr : "");
 	else
-	  snprintf(buf, buflen, "%s (cpuset %s)",
-		   typestr, cpusetstr);
+	  snprintf(buf, buflen, "%s (cpuset %s%s%s)",
+		   typestr, cpusetstr,
+		   nodesetstr ? " nodeset " : "",
+		   nodesetstr ? nodesetstr : "");
 	free(cpusetstr);
+	free(nodesetstr);
 }
 
 /*
@@ -974,50 +1207,29 @@ hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
  * complete.
  */
 
-#define merge_index(new, old, field, type) \
-  if ((old)->field == (type) -1) \
-    (old)->field = (new)->field;
-#define merge_sizes(new, old, field) \
-  if (!(old)->field) \
-    (old)->field = (new)->field;
-#ifdef HWLOC_DEBUG
-#define check_sizes(new, old, field) \
-  if ((new)->field) \
-    assert((old)->field == (new)->field)
-#else
-#define check_sizes(new, old, field)
-#endif
-
+/* merge new object attributes in old.
+ * use old if defined, otherwise use new.
+ */
 static void
 merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
 {
-  merge_index(new, old, os_index, unsigned);
-
-  if (new->distances_count) {
-    if (old->distances_count) {
-      old->distances_count += new->distances_count;
-      old->distances = realloc(old->distances, old->distances_count * sizeof(*old->distances));
-      memcpy(old->distances + new->distances_count, new->distances, new->distances_count * sizeof(*old->distances));
-      free(new->distances);
-    } else {
-      old->distances_count = new->distances_count;
-      old->distances = new->distances;
-    }
-    new->distances_count = 0;
-    new->distances = NULL;
-  }
+  if (old->os_index == HWLOC_UNKNOWN_INDEX)
+    old->os_index = new->os_index;
 
   if (new->infos_count) {
+    /* FIXME: dedup */
     hwloc__move_infos(&old->infos, &old->infos_count,
 		      &new->infos, &new->infos_count);
   }
 
-  if (new->name) {
-    if (old->name)
-      free(old->name);
+  if (new->name && !old->name) {
     old->name = new->name;
     new->name = NULL;
   }
+  if (new->subtype && !old->subtype) {
+    old->subtype = new->subtype;
+    new->subtype = NULL;
+  }
 
   /* Ignore userdata. It will be NULL before load().
    * It may be non-NULL if alloc+insert_group() after load().
@@ -1025,34 +1237,98 @@ merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
 
   switch(new->type) {
   case HWLOC_OBJ_NUMANODE:
-    /* Do not check these, it may change between calls */
-    merge_sizes(new, old, memory.local_memory);
-    merge_sizes(new, old, memory.total_memory);
-    /* if both newects have a page_types array, just keep the biggest one for now */
-    if (new->memory.page_types_len && old->memory.page_types_len)
-      hwloc_debug("%s", "merging page_types by keeping the biggest one only\n");
-    if (new->memory.page_types_len < old->memory.page_types_len) {
-      free(new->memory.page_types);
-    } else {
-      free(old->memory.page_types);
-      old->memory.page_types_len = new->memory.page_types_len;
-      old->memory.page_types = new->memory.page_types;
-      new->memory.page_types = NULL;
-      new->memory.page_types_len = 0;
+    if (new->attr->numanode.local_memory && !old->attr->numanode.local_memory) {
+      /* no memory in old, use new memory */
+      old->attr->numanode.local_memory = new->attr->numanode.local_memory;
+      free(old->attr->numanode.page_types);
+      old->attr->numanode.page_types_len = new->attr->numanode.page_types_len;
+      old->attr->numanode.page_types = new->attr->numanode.page_types;
+      new->attr->numanode.page_types = NULL;
+      new->attr->numanode.page_types_len = 0;
     }
+    /* old->attr->numanode.total_memory will be updated by propagate_total_memory() */
     break;
-  case HWLOC_OBJ_CACHE:
-    merge_sizes(new, old, attr->cache.size);
-    check_sizes(new, old, attr->cache.size);
-    merge_sizes(new, old, attr->cache.linesize);
-    check_sizes(new, old, attr->cache.linesize);
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+    if (!old->attr->cache.size) /* each cache attribute is only taken from new when old has none */
+      old->attr->cache.size = new->attr->cache.size;
+    if (!old->attr->cache.linesize) /* fill linesize itself, do not clobber size */
+      old->attr->cache.linesize = new->attr->cache.linesize;
+    if (!old->attr->cache.associativity) /* fill associativity itself, do not clobber size */
+      old->attr->cache.associativity = new->attr->cache.associativity;
     break;
   default:
     break;
   }
 }
 
-/* Try to insert OBJ in CUR, recurse if needed.
+/* Group-aware merge of new with old: returns the object kept by the merge (old or new), or NULL if not merged */
+static __hwloc_inline hwloc_obj_t
+hwloc__insert_try_merge_group(hwloc_obj_t old, hwloc_obj_t new)
+{
+  if (new->type == HWLOC_OBJ_GROUP && old->type == HWLOC_OBJ_GROUP) {
+    /* both are Groups, which one do we keep? */
+    if (new->attr->group.dont_merge) {
+      if (old->attr->group.dont_merge)
+	/* nobody wants to be merged */
+	return NULL;
+
+      /* keep the new one, it doesn't want to be merged */
+      hwloc_replace_linked_object(old, new);
+      return new;
+
+    } else {
+      if (old->attr->group.dont_merge)
+	/* keep the old one, it doesn't want to be merged */
+	return old;
+
+      /* compare group kinds to decide who to keep: the smaller kind wins, old is returned either way */
+      if (new->attr->group.kind < old->attr->group.kind)
+	hwloc_replace_linked_object(old, new);
+      return old;
+    }
+  }
+
+  if (new->type == HWLOC_OBJ_GROUP && !new->attr->group.dont_merge) {
+
+    if (old->type == HWLOC_OBJ_PU && new->attr->group.kind == HWLOC_GROUP_KIND_MEMORY)
+      /* Never merge Memory groups with PU, we don't want to attach Memory under PU */
+      return NULL;
+
+    /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
+     * while some callers need to know (at least hwloc_topology_insert_group()).
+     */
+    return old;
+
+  } else if (old->type == HWLOC_OBJ_GROUP && !old->attr->group.dont_merge) {
+
+    if (new->type == HWLOC_OBJ_PU && old->attr->group.kind == HWLOC_GROUP_KIND_MEMORY)
+      /* Never merge Memory groups with PU, we don't want to attach Memory under PU */
+      return NULL;
+
+    /* Replace the Group with the new object contents
+     * and let the caller free the new object
+     */
+    hwloc_replace_linked_object(old, new);
+    return old;
+
+  } else {
+    /* neither object is a mergeable Group, cannot merge */
+    return NULL;
+  }
+}
+
+/*
+ * The main insertion routine, only used for CPU-side object (normal types)
+ * using cpuset only (or complete_cpuset).
+ *
+ * Try to insert OBJ in CUR, recurse if needed.
  * Returns the object if it was inserted,
  * the remaining object it was merged,
  * NULL if failed to insert.
@@ -1068,11 +1344,7 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur
   /* Pointer where OBJ should be put */
   hwloc_obj_t *putp = NULL; /* OBJ position isn't found yet */
 
-  /* Make sure we haven't gone too deep.  */
-  if (!hwloc_bitmap_isincluded(obj->cpuset, cur->cpuset)) {
-    fprintf(stderr,"recursion has gone too deep?!\n");
-    return NULL;
-  }
+  assert(!hwloc__obj_type_is_memory(obj->type));
 
   /* Iteration with prefetching to be completely safe against CHILD removal.
    * The list is already sorted by cpuset, and there's no intersection between siblings.
@@ -1082,36 +1354,21 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur
        child = next_child, child ? next_child = child->next_sibling : NULL) {
 
     int res = hwloc_obj_cmp_sets(obj, child);
+    int setres = res;
 
     if (res == HWLOC_OBJ_EQUAL) {
-      if (obj->type == HWLOC_OBJ_GROUP) {
-	/* Group are ignored keep_structure. ignored always are handled earlier. Non-ignored Groups isn't possible. */
-	assert(topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE);
-        /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
-	 * while some callers need to know (at least hwloc_topology_insert_group()).
-	 *
-	 * Keep EQUAL so that the Group gets merged.
-	 */
-      } else {
-	/* otherwise compare actual types to decide of the inclusion */
-	res = hwloc_obj_cmp_types(obj, child);
-      }
+      hwloc_obj_t merged = hwloc__insert_try_merge_group(child, obj);
+      if (merged)
+	return merged;
+      /* otherwise compare actual types to decide of the inclusion */
+      res = hwloc_type_cmp(obj, child);
     }
 
     switch (res) {
       case HWLOC_OBJ_EQUAL:
-	/* Can be two objects with same type. Or one Group and anything else. */
-	if (obj->type == child->type
-	    && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)
-	    && obj->os_index != child->os_index) {
-	  static int reported = 0;
-	  if (!reported && !hwloc_hide_errors()) {
-	    fprintf(stderr, "Cannot merge similar %s objects with different OS indexes %u and %u\n",
-		    hwloc_obj_type_string(obj->type), child->os_index, obj->os_index);
-	    reported = 1;
-	  }
-          return NULL;
-	}
+	/* Two objects with same type.
+	 * Groups are handled above.
+	 */
 	merge_insert_equal(obj, child);
 	/* Already present, no need to insert.  */
 	return child;
@@ -1124,7 +1381,7 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur
         if (report_error) {
 	  char childstr[512];
 	  char objstr[512];
-	  char msg[1024];
+	  char msg[1100];
 	  hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
 	  hwloc__report_error_format_obj(childstr, sizeof(childstr), child);
 	  snprintf(msg, sizeof(msg), "%s intersects with %s without inclusion!", objstr, childstr);
@@ -1149,6 +1406,10 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur
 	*obj_children = child;
 	obj_children = &child->next_sibling;
 	child->parent = obj;
+	if (setres == HWLOC_OBJ_EQUAL) {
+	  obj->memory_first_child = child->memory_first_child;
+	  child->memory_first_child = NULL;
+	}
 	break;
     }
   }
@@ -1186,25 +1447,267 @@ hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur
   return NULL;
 }
 
+/* this differs from hwloc_get_obj_covering_cpuset() by:
+ * - not looking at the parent cpuset first, which means we can insert
+ *   below root even if root PU bits are not set yet (PU are inserted later).
+ * - returning the first child that exactly matches instead of walking down in case
+ *   of identical children.
+ */
+static struct hwloc_obj *
+hwloc__find_obj_covering_memory_cpuset(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_bitmap_t cpuset)
+{
+  hwloc_obj_t covering;
+  /* walk down while a child covers cpuset, stopping at the first child that matches it exactly */
+  while ((covering = hwloc_get_child_covering_cpuset(topology, cpuset, parent)) != NULL
+	 && !hwloc_bitmap_isequal(covering->cpuset, cpuset))
+    parent = covering;
+  return covering ? covering : parent;
+}
+
+static struct hwloc_obj *
+hwloc__find_insert_memory_parent(struct hwloc_topology *topology, hwloc_obj_t obj,
+				 hwloc_report_error_t report_error)
+{
+  hwloc_obj_t parent, group, result;
+
+  if (hwloc_bitmap_iszero(obj->cpuset)) {
+    /* CPU-less go in dedicated group below root */
+    parent = topology->levels[0][0];
+
+  } else {
+    /* find the highest obj covering the cpuset */
+    parent = hwloc__find_obj_covering_memory_cpuset(topology, topology->levels[0][0], obj->cpuset);
+    if (!parent) {
+      /* fallback to root */
+      parent = hwloc_get_root_obj(topology);
+    }
+
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* Never attach to PU, try parent */
+      parent = parent->parent;
+      assert(parent);
+    }
+
+    /* TODO: if root->cpuset was updated earlier, we would be sure whether the group will remain identical to root */
+    if (parent != topology->levels[0][0] && hwloc_bitmap_isequal(parent->cpuset, obj->cpuset))
+      /* that parent is fine */
+      return parent;
+  }
+
+  if (!hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP))
+    /* even if parent isn't perfect, we don't want an intermediate group */
+    return parent;
+
+  /* need to insert an intermediate group for attaching the NUMA node */
+  group = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+  if (!group)
+    /* failed to create the group, fallback to larger parent */
+    return parent;
+
+  group->attr->group.kind = HWLOC_GROUP_KIND_MEMORY;
+  group->cpuset = hwloc_bitmap_dup(obj->cpuset);
+  group->complete_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
+  /* We could duplicate nodesets too, but hwloc__insert_object_by_cpuset()
+   * doesn't actually need them, and doing so could prevent future calls from
+   * reusing that group for other NUMA nodes.
+   */
+  if (!group->cpuset != !obj->cpuset
+      || !group->complete_cpuset != !obj->complete_cpuset) {
+    /* failed to create the group, fallback to larger parent */
+    hwloc_free_unlinked_object(group);
+    return parent;
+  }
+
+  result = hwloc__insert_object_by_cpuset(topology, parent, group, report_error);
+  if (!result) {
+    /* failed to insert, fallback to larger parent */
+    return parent;
+  }
+
+  assert(result == group);
+  return group;
+}
+
+/* only works for MEMCACHE and NUMAnode with a single bit in nodeset */
+static hwloc_obj_t
+hwloc___attach_memory_object_by_nodeset(struct hwloc_topology *topology, hwloc_obj_t parent,
+					hwloc_obj_t obj,
+					hwloc_report_error_t report_error)
+{
+  hwloc_obj_t *curp = &parent->memory_first_child;
+  unsigned first = hwloc_bitmap_first(obj->nodeset);
+
+  while (*curp) {
+    hwloc_obj_t cur = *curp;
+    unsigned curfirst = hwloc_bitmap_first(cur->nodeset);
+
+    if (first < curfirst) {
+      /* insert before cur */
+      obj->next_sibling = cur;
+      *curp = obj;
+      obj->memory_first_child = NULL;
+      obj->parent = parent;
+      topology->modified = 1;
+      return obj;
+    }
+
+    if (first == curfirst) {
+      /* identical nodeset */
+      if (obj->type == HWLOC_OBJ_NUMANODE) {
+	if (cur->type == HWLOC_OBJ_NUMANODE) {
+	  /* identical NUMA nodes? ignore the new one */
+	  if (report_error) {
+	    char curstr[512];
+	    char objstr[512];
+	    char msg[1100];
+	    hwloc__report_error_format_obj(curstr, sizeof(curstr), cur);
+	    hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
+	    snprintf(msg, sizeof(msg), "%s and %s have identical nodesets!", objstr, curstr);
+	    report_error(msg, __LINE__);
+	  }
+	  return NULL;
+	}
+	assert(cur->type == HWLOC_OBJ_MEMCACHE);
+	/* insert the new NUMA node below that existing memcache */
+	return hwloc___attach_memory_object_by_nodeset(topology, cur, obj, report_error);
+
+      } else {
+	assert(obj->type == HWLOC_OBJ_MEMCACHE);
+	if (cur->type == HWLOC_OBJ_MEMCACHE) {
+	  if (cur->attr->cache.depth == obj->attr->cache.depth)
+	    /* memcache with same nodeset and depth, ignore the new one */
+	    return NULL;
+	  if (cur->attr->cache.depth > obj->attr->cache.depth)
+	    /* memcache with higher cache depth is actually *higher* in the hierarchy
+	     * (depth starts from the NUMA node).
+	     * insert the new memcache below the existing one
+	     */
+	    return hwloc___attach_memory_object_by_nodeset(topology, cur, obj, report_error);
+	}
+	/* insert the memcache above the existing memcache or numa node */
+	obj->next_sibling = cur->next_sibling;
+	cur->next_sibling = NULL;
+	obj->memory_first_child = cur;
+	cur->parent = obj;
+	*curp = obj;
+	obj->parent = parent;
+	topology->modified = 1;
+	return obj;
+      }
+    }
+
+    curp = &cur->next_sibling;
+  }
+
+  /* append to the end of the list */
+  obj->next_sibling = NULL;
+  *curp = obj;
+  obj->memory_first_child = NULL;
+  obj->parent = parent;
+  topology->modified = 1;
+  return obj;
+}
+
+/* Attach the given memory object below the given normal parent.
+ *
+ * Only the nodeset is used to find the location inside memory children below parent.
+ *
+ * Nodeset inclusion inside the given memory hierarchy is guaranteed by this function,
+ * but nodesets are not propagated to CPU-side parent yet. It will be done by
+ * propagate_nodeset() later.
+ */
+struct hwloc_obj *
+hwloc__attach_memory_object(struct hwloc_topology *topology, hwloc_obj_t parent,
+			    hwloc_obj_t obj,
+			    hwloc_report_error_t report_error)
+{
+  hwloc_obj_t result;
+
+  assert(parent);
+  assert(hwloc__obj_type_is_normal(parent->type));
+
+  /* Check the nodeset */
+  if (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+    return NULL;
+  /* Initialize or check the complete nodeset */
+  if (!obj->complete_nodeset) {
+    obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+  } else if (!hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset)) {
+    return NULL;
+  }
+  /* Neither ACPI nor Linux support multinode mscache */
+  assert(hwloc_bitmap_weight(obj->nodeset) == 1);
+
+#if 0
+  /* TODO: enable this instead of hack in fixup_sets once NUMA nodes are inserted late */
+  /* copy the parent cpuset in case it's larger than expected.
+   * we could also keep the cpuset smaller than the parent and say that a normal-parent
+   * can have multiple memory children with smaller cpusets.
+   * However, the user decided to ignore Groups, so hierarchy/locality loss is expected.
+   */
+  hwloc_bitmap_copy(obj->cpuset, parent->cpuset);
+  hwloc_bitmap_copy(obj->complete_cpuset, parent->complete_cpuset);
+#endif
+
+  result = hwloc___attach_memory_object_by_nodeset(topology, parent, obj, report_error);
+  if (result == obj) {
+    /* Add the bit to the top sets (nodeset and complete_nodeset of the top-level object) */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      hwloc_bitmap_set(topology->levels[0][0]->nodeset, obj->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_nodeset, obj->os_index);
+    }
+  }
+  if (result != obj) {
+    /* either failed to insert, or got merged, free the original object */
+    hwloc_free_unlinked_object(obj);
+  }
+  return result;
+}
+
 /* insertion routine that lets you change the error reporting callback */
 struct hwloc_obj *
-hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj,
+hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t root,
+			       hwloc_obj_t obj,
 			       hwloc_report_error_t report_error)
 {
   struct hwloc_obj *result;
 
-  assert(!hwloc_obj_type_is_special(obj->type));
+#ifdef HWLOC_DEBUG
+  assert(!hwloc__obj_type_is_special(obj->type));
+
+  /* we need at least one non-NULL set (normal or complete, cpuset or nodeset) */
+  assert(obj->cpuset || obj->complete_cpuset || obj->nodeset || obj->complete_nodeset);
+  /* we support the case where all of them are empty.
+   * it may happen when hwloc__find_insert_memory_parent()
+   * inserts a Group for a CPU-less NUMA-node.
+   */
+#endif
+
+  if (hwloc__obj_type_is_memory(obj->type)) {
+    if (!root) {
+      root = hwloc__find_insert_memory_parent(topology, obj, report_error);
+      if (!root) {
+	hwloc_free_unlinked_object(obj);
+	return NULL;
+      }
+    }
+    return hwloc__attach_memory_object(topology, root, obj, report_error);
+  }
 
-  /* Start at the top.  */
-  result = hwloc___insert_object_by_cpuset(topology, topology->levels[0][0], obj, report_error);
+  if (!root)
+    /* Start at the top. */
+    root = topology->levels[0][0];
+
+  result = hwloc___insert_object_by_cpuset(topology, root, obj, report_error);
+  if (result && result->type == HWLOC_OBJ_PU) {
+      /* Add the bit to the top sets */
+      if (hwloc_bitmap_isset(result->cpuset, result->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->cpuset, result->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, result->os_index);
+  }
   if (result != obj) {
     /* either failed to insert, or got merged, free the original object */
     hwloc_free_unlinked_object(obj);
-  } else {
-    /* Add the cpuset to the top */
-    hwloc_bitmap_or(topology->levels[0][0]->complete_cpuset, topology->levels[0][0]->complete_cpuset, obj->cpuset);
-    if (obj->nodeset)
-      hwloc_bitmap_or(topology->levels[0][0]->complete_nodeset, topology->levels[0][0]->complete_nodeset, obj->nodeset);
   }
   return result;
 }
@@ -1214,7 +1717,7 @@ hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj,
 struct hwloc_obj *
 hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj)
 {
-  return hwloc__insert_object_by_cpuset(topology, obj, hwloc_report_os_error);
+  return hwloc__insert_object_by_cpuset(topology, NULL, obj, hwloc_report_os_error);
 }
 
 void
@@ -1225,9 +1728,18 @@ hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t paren
   if (obj->type == HWLOC_OBJ_MISC) {
     /* Append to the end of the Misc list */
     for (current = &parent->misc_first_child; *current; current = &(*current)->next_sibling);
-  } else if (hwloc_obj_type_is_io(obj->type)) {
+  } else if (hwloc__obj_type_is_io(obj->type)) {
     /* Append to the end of the I/O list */
     for (current = &parent->io_first_child; *current; current = &(*current)->next_sibling);
+  } else if (hwloc__obj_type_is_memory(obj->type)) {
+    /* Append to the end of the memory list */
+    for (current = &parent->memory_first_child; *current; current = &(*current)->next_sibling);
+    /* Add the bit to the top sets */
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      if (hwloc_bitmap_isset(obj->nodeset, obj->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->nodeset, obj->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_nodeset, obj->os_index);
+    }
   } else {
     /* Append to the end of the list.
      * The caller takes care of inserting children in the right cpuset order, without intersection between them.
@@ -1236,6 +1748,12 @@ hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t paren
      * Other callers just insert random objects such as I/O or Misc, no cpuset issue there.
      */
     for (current = &parent->first_child; *current; current = &(*current)->next_sibling);
+    /* Add the bit to the top sets */
+    if (obj->type == HWLOC_OBJ_PU) {
+      if (hwloc_bitmap_isset(obj->cpuset, obj->os_index))
+	hwloc_bitmap_set(topology->levels[0][0]->cpuset, obj->os_index);
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, obj->os_index);
+    }
   }
 
   *current = obj;
@@ -1245,19 +1763,51 @@ hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t paren
 }
 
 hwloc_obj_t
-hwloc_topology_alloc_group_object(struct hwloc_topology *topology __hwloc_attribute_unused)
+hwloc_alloc_setup_object(hwloc_topology_t topology,
+			 hwloc_obj_type_t type, unsigned os_index)
 {
-  hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+  struct hwloc_obj *obj = hwloc_tma_malloc(topology->tma, sizeof(*obj));
   if (!obj)
     return NULL;
-  obj->attr->group.depth = -1;
+  memset(obj, 0, sizeof(*obj));
+  obj->type = type;
+  obj->os_index = os_index;
+  obj->gp_index = topology->next_gp_index++;
+  obj->attr = hwloc_tma_malloc(topology->tma, sizeof(*obj->attr));
+  if (!obj->attr) {
+    assert(!topology->tma || !topology->tma->dontfree); /* this tma cannot fail to allocate */
+    free(obj);
+    return NULL;
+  }
+  memset(obj->attr, 0, sizeof(*obj->attr));
+  /* do not allocate the cpuset here, let the caller do it */
   return obj;
 }
 
+hwloc_obj_t
+hwloc_topology_alloc_group_object(struct hwloc_topology *topology)
+{
+  if (!topology->is_loaded) {
+    /* this could actually work, see insert() below */
+    errno = EINVAL;
+    return NULL;
+  }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
+    return NULL;
+  }
+  return hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+}
+
+static void hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root);
+static void propagate_total_memory(hwloc_obj_t obj);
+static void hwloc_set_group_depth(hwloc_topology_t topology);
+
 hwloc_obj_t
 hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
 {
-  hwloc_obj_t res;
+  hwloc_obj_t res, root;
+  int cmp;
 
   if (!topology->is_loaded) {
     /* this could actually work, we would just need to disable connect_children/levels below */
@@ -1265,23 +1815,62 @@ hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t
     errno = EINVAL;
     return NULL;
   }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
+    return NULL;
+  }
 
-  if (topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_ALWAYS) {
+  if (topology->type_filter[HWLOC_OBJ_GROUP] == HWLOC_TYPE_FILTER_KEEP_NONE) {
     hwloc_free_unlinked_object(obj);
     errno = EINVAL;
     return NULL;
   }
 
+  root = hwloc_get_root_obj(topology);
+  if (obj->cpuset)
+    hwloc_bitmap_and(obj->cpuset, obj->cpuset, root->cpuset);
+  if (obj->complete_cpuset)
+    hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, root->complete_cpuset);
+  if (obj->nodeset)
+    hwloc_bitmap_and(obj->nodeset, obj->nodeset, root->nodeset);
+  if (obj->complete_nodeset)
+    hwloc_bitmap_and(obj->complete_nodeset, obj->complete_nodeset, root->complete_nodeset);
+
   if ((!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset))
-      && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))
-      && (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
-      && (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset))) {
-    hwloc_free_unlinked_object(obj);
-    errno = EINVAL;
-    return NULL;
+      && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))) {
+    /* we'll insert by cpuset, so build cpuset from the nodeset */
+    hwloc_const_bitmap_t nodeset = obj->nodeset ? obj->nodeset : obj->complete_nodeset;
+    hwloc_obj_t numa;
+
+    if ((!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+	&& (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset))) {
+      hwloc_free_unlinked_object(obj);
+      errno = EINVAL;
+      return NULL;
+    }
+
+    if (!obj->cpuset) {
+      obj->cpuset = hwloc_bitmap_alloc();
+      if (!obj->cpuset) {
+	hwloc_free_unlinked_object(obj);
+	return NULL;
+      }
+    }
+
+    numa = NULL;
+    while ((numa = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, numa)) != NULL)
+      if (hwloc_bitmap_isset(nodeset, numa->os_index))
+	hwloc_bitmap_or(obj->cpuset, obj->cpuset, numa->cpuset);
+  }
+
+  cmp = hwloc_obj_cmp_sets(obj, root);
+  if (cmp == HWLOC_OBJ_INCLUDED) {
+    res = hwloc__insert_object_by_cpuset(topology, NULL, obj, NULL /* do not show errors on stdout */);
+  } else {
+    /* just merge root */
+    res = root;
   }
 
-  res = hwloc__insert_object_by_cpuset(topology, obj, NULL /* do not show errors on stdout */);
   if (!res)
     return NULL;
   if (res != obj)
@@ -1290,21 +1879,26 @@ hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t
 
   /* properly inserted */
   hwloc_obj_add_children_sets(obj);
-  hwloc_connect_children(topology->levels[0][0]);
-  if (hwloc_connect_levels(topology) < 0)
+  if (hwloc_topology_reconnect(topology, 0) < 0)
     return NULL;
-  topology->modified = 0;
+
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+  hwloc_set_group_depth(topology);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
   return obj;
 }
 
-static void hwloc_connect_misc_level(hwloc_topology_t topology);
-
 hwloc_obj_t
 hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t parent, const char *name)
 {
   hwloc_obj_t obj;
 
-  if (topology->ignored_types[HWLOC_OBJ_MISC] == HWLOC_IGNORE_TYPE_ALWAYS) {
+  if (topology->type_filter[HWLOC_OBJ_MISC] == HWLOC_TYPE_FILTER_KEEP_NONE) {
     errno = EINVAL;
     return NULL;
   }
@@ -1313,24 +1907,109 @@ hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t p
     errno = EINVAL;
     return NULL;
   }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
+    return NULL;
+  }
 
-  obj = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, -1);
+  obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MISC, HWLOC_UNKNOWN_INDEX);
   if (name)
     obj->name = strdup(name);
 
   hwloc_insert_object_by_parent(topology, parent, obj);
 
-  hwloc_connect_children(parent); /* FIXME: only connect misc children */
-  hwloc_connect_misc_level(topology);
-  topology->modified = 0;
+  /* FIXME: only connect misc parent children and misc level,
+   * but this API is likely not performance critical anyway
+   */
+  hwloc_topology_reconnect(topology, 0);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
 
   return obj;
 }
 
-static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+/* assuming set is included in the topology complete_cpuset
+ * and all objects have a proper complete_cpuset,
+ * return the best one containing set.
+ * if some object are equivalent (same complete_cpuset), return the highest one.
+ */
+static hwloc_obj_t
+hwloc_get_highest_obj_covering_complete_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
 {
-  const struct hwloc_obj_memory_page_type_s *a = _a;
-  const struct hwloc_obj_memory_page_type_s *b = _b;
+  hwloc_obj_t current = hwloc_get_root_obj(topology);
+  hwloc_obj_t child;
+
+  if (hwloc_bitmap_isequal(set, current->complete_cpuset))
+    /* root cpuset is exactly what we want, no need to look at children, we want the highest */
+    return current;
+
+ recurse:
+  /* find the right child */
+  for_each_child(child, current) {
+    if (hwloc_bitmap_isequal(set, child->complete_cpuset))
+      /* child cpuset is exactly what we want, no need to look at children, we want the highest */
+      return child;
+    if (!hwloc_bitmap_iszero(child->complete_cpuset) && hwloc_bitmap_isincluded(set, child->complete_cpuset))
+      break;
+  }
+
+  if (child) {
+    current = child;
+    goto recurse;
+  }
+
+  /* no better child */
+  return current;
+}
+
+hwloc_obj_t
+hwloc_find_insert_io_parent_by_complete_cpuset(struct hwloc_topology *topology, hwloc_cpuset_t cpuset)
+{
+  hwloc_obj_t group_obj, largeparent, parent;
+
+  /* restrict to the existing complete cpuset to avoid errors later */
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
+  if (hwloc_bitmap_iszero(cpuset))
+    /* remaining cpuset is empty, invalid */
+    return NULL;
+
+  largeparent = hwloc_get_highest_obj_covering_complete_cpuset(topology, cpuset);
+  if (hwloc_bitmap_isequal(largeparent->complete_cpuset, cpuset)
+      || !hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP))
+    /* Found a valid object (normal case) */
+    return largeparent;
+
+  /* we need to insert an intermediate group */
+  group_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
+  if (!group_obj)
+    /* Failed to allocate the exact Group, fallback to largeparent */
+    return largeparent;
+
+  group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
+  group_obj->cpuset = hwloc_bitmap_dup(cpuset);
+  group_obj->attr->group.kind = HWLOC_GROUP_KIND_IO;
+  parent = hwloc__insert_object_by_cpuset(topology, largeparent, group_obj, hwloc_report_os_error);
+  if (!parent)
+    /* Failed to insert the Group, maybe a conflicting cpuset */
+    return largeparent;
+
+  /* Group couldn't get merged or we would have gotten the right largeparent earlier */
+  assert(parent == group_obj);
+
+  /* Group inserted without being merged, everything OK, setup its sets */
+  hwloc_obj_add_children_sets(group_obj);
+
+  return parent;
+}
+
+static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+{
+  const struct hwloc_memory_page_type_s *a = _a;
+  const struct hwloc_memory_page_type_s *b = _b;
   /* consider 0 as larger so that 0-size page_type go to the end */
   if (!b->size)
     return -1;
@@ -1344,119 +2023,91 @@ static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
 static void
 propagate_total_memory(hwloc_obj_t obj)
 {
-  hwloc_obj_t *temp, child;
+  hwloc_obj_t child;
   unsigned i;
 
   /* reset total before counting local and children memory */
-  obj->memory.total_memory = 0;
+  obj->total_memory = 0;
 
   /* Propagate memory up. */
-  for_each_child_safe(child, obj, temp) {
+  for_each_child(child, obj) {
+    propagate_total_memory(child);
+    obj->total_memory += child->total_memory;
+  }
+  for_each_memory_child(child, obj) {
     propagate_total_memory(child);
-    obj->memory.total_memory += child->memory.total_memory;
+    obj->total_memory += child->total_memory;
   }
   /* No memory under I/O or Misc */
 
-  obj->memory.total_memory += obj->memory.local_memory;
-
-  /* By the way, sort the page_type array.
-   * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
-   */
-  qsort(obj->memory.page_types, obj->memory.page_types_len, sizeof(*obj->memory.page_types), hwloc_memory_page_type_compare);
-  /* Ignore 0-size page_types, they are at the end */
-  for(i=obj->memory.page_types_len; i>=1; i--)
-    if (obj->memory.page_types[i-1].size)
-      break;
-  obj->memory.page_types_len = i;
-}
-
-/* Collect the cpuset of all the PU objects. */
-static void
-collect_proc_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
-{
-  hwloc_obj_t child, *temp;
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    obj->total_memory += obj->attr->numanode.local_memory; /* only NUMA nodes add local memory here */
 
-  if (sys) {
-    /* We are already given a pointer to a system object */
-    if (obj->type == HWLOC_OBJ_PU)
-      hwloc_bitmap_or(sys->cpuset, sys->cpuset, obj->cpuset);
-  } else {
-    if (obj->cpuset) {
-      /* This object is the root of a machine */
-      sys = obj;
-      /* Assume no PU for now */
-      hwloc_bitmap_zero(obj->cpuset);
-    }
+    /* Sort the page_types array of this NUMA node with hwloc_memory_page_type_compare().
+     * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
+     */
+    qsort(obj->attr->numanode.page_types, obj->attr->numanode.page_types_len, sizeof(*obj->attr->numanode.page_types), hwloc_memory_page_type_compare);
+    /* Shrink page_types_len to drop trailing 0-size page_types (the comparator sorts them to the end) */
+    for(i=obj->attr->numanode.page_types_len; i>=1; i--)
+      if (obj->attr->numanode.page_types[i-1].size)
+	break;
+    obj->attr->numanode.page_types_len = i;
   }
-
-  for_each_child_safe(child, obj, temp)
-    collect_proc_cpuset(child, sys);
-  /* No PU under I/O or Misc */
 }
 
-/* While traversing down and up, propagate the disallowed cpus by
- * and'ing them to and from the first object that has a cpuset */
+/* Now that root sets are ready, propagate them down to normal and memory children:
+ * cpuset/nodeset are restricted to the parent's ones, missing complete_* sets are duplicated from them.
+ */
 static void
-propagate_unused_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+fixup_sets(hwloc_obj_t obj)
 {
-  hwloc_obj_t child, *temp;
-
-  if (obj->cpuset) {
-    if (sys) {
-      /* We are already given a pointer to an system object, update it and update ourselves */
-      hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+  int in_memory_list;
+  hwloc_obj_t child;
 
-      /* Apply the topology cpuset */
-      hwloc_bitmap_and(obj->cpuset, obj->cpuset, sys->cpuset);
+  child = obj->first_child;
+  in_memory_list = 0;
+  /* iterate over normal children first, we'll come back for memory children later (see the goto below) */
 
-      /* Update complete cpuset down */
-      if (obj->complete_cpuset) {
-	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, sys->complete_cpuset);
-      } else {
-	obj->complete_cpuset = hwloc_bitmap_dup(sys->complete_cpuset);
-	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, obj->cpuset);
-      }
+  /* FIXME: if memory objects are inserted late, we should update their cpuset and complete_cpuset at insertion instead of here */
+ iterate:
+  while (child) {
+    /* our cpuset and nodeset must be included in our parent's ones (both are assumed to be already allocated) */
+    hwloc_bitmap_and(child->cpuset, child->cpuset, obj->cpuset);
+    hwloc_bitmap_and(child->nodeset, child->nodeset, obj->nodeset);
+    /* our complete_cpuset must be included in our parent's one, but can be larger than our cpuset */
+    if (child->complete_cpuset) {
+      hwloc_bitmap_and(child->complete_cpuset, child->complete_cpuset, obj->complete_cpuset);
+    } else {
+      child->complete_cpuset = hwloc_bitmap_dup(child->cpuset);
+    }
+    if (child->complete_nodeset) {
+      hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
+    } else {
+      child->complete_nodeset = hwloc_bitmap_dup(child->nodeset);
+    }
 
-      /* Update allowed cpusets */
-      if (obj->allowed_cpuset) {
-	/* Update ours */
-	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, sys->allowed_cpuset);
+    if (hwloc_obj_type_is_memory(child->type)) {
+      /* memory children take a copy of the parent cpusets (replacing the ANDs above) in case some CPU-side parent was removed */
+      hwloc_bitmap_copy(child->cpuset, obj->cpuset);
+      hwloc_bitmap_copy(child->complete_cpuset, obj->complete_cpuset);
+    }
 
-	/* Update the given cpuset, but only what we know */
-	hwloc_bitmap_copy(mask, obj->cpuset);
-	hwloc_bitmap_not(mask, mask);
-	hwloc_bitmap_or(mask, mask, obj->allowed_cpuset);
-	hwloc_bitmap_and(sys->allowed_cpuset, sys->allowed_cpuset, mask);
-      } else {
-	/* Just take it as such */
-	obj->allowed_cpuset = hwloc_bitmap_dup(sys->allowed_cpuset);
-	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->cpuset);
-      }
+    fixup_sets(child);
+    child = child->next_sibling;
+  }
 
-      hwloc_bitmap_free(mask);
-    } else {
-      /* This object is the root of a machine */
-      sys = obj;
-      /* Apply complete_cpuset to cpuset and allowed_cpuset, it
-       * will automatically be applied below */
-      if (obj->complete_cpuset)
-        hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->complete_cpuset);
-      else
-        obj->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
-      if (obj->allowed_cpuset)
-        hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->complete_cpuset);
-      else
-        obj->allowed_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
-    }
+  /* switch to memory children list if any (in_memory_list makes sure this second pass runs only once) */
+  if (!in_memory_list && obj->memory_first_child) {
+    child = obj->memory_first_child;
+    in_memory_list = 1;
+    goto iterate;
   }
 
-  for_each_child_safe(child, obj, temp)
-    propagate_unused_cpuset(child, sys);
-  /* No PU under I/O or Misc */
+  /* No sets in I/O or Misc */
 }
 
 /* Setup object cpusets/nodesets by OR'ing its children. */
-HWLOC_DECLSPEC int
+int
 hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
 {
 #define ADD_OTHER_OBJ_SET(_dst, _src, _set)			\
@@ -1467,152 +2118,136 @@ hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
   }
   ADD_OTHER_OBJ_SET(dst, src, cpuset);
   ADD_OTHER_OBJ_SET(dst, src, complete_cpuset);
-  ADD_OTHER_OBJ_SET(dst, src, allowed_cpuset);
   ADD_OTHER_OBJ_SET(dst, src, nodeset);
   ADD_OTHER_OBJ_SET(dst, src, complete_nodeset);
-  ADD_OTHER_OBJ_SET(dst, src, allowed_nodeset);
   return 0;
 }
 
-HWLOC_DECLSPEC int
+int
 hwloc_obj_add_children_sets(hwloc_obj_t obj)
 {
   hwloc_obj_t child;
-  assert(obj->cpuset != NULL);
-  child = obj->first_child;
-  while (child) {
-    assert(child->cpuset != NULL);
+  for_each_child(child, obj) { /* merge each child's sets into obj with hwloc_obj_add_other_obj_sets() */
     hwloc_obj_add_other_obj_sets(obj, child);
-    child = child->next_sibling;
   }
   /* No need to look at Misc children, they contain no PU. */
   return 0;
 }
 
-/* Propagate nodesets up and down */
+/* CPU objects are inserted by cpusets, we know their cpusets are properly included.
+ * We just need fixup_sets() to make sure they aren't too wide.
+ *
+ * Within each memory hierarchy, nodesets are consistent as well.
+ * However they must be propagated to their CPU-side parents.
+ *
+ * A memory object nodeset consists of NUMA nodes below it.
+ * A normal object nodeset consists of NUMA nodes attached to any
+ * of its children or parents.
+ */
 static void
-propagate_nodeset(hwloc_obj_t obj, hwloc_obj_t sys)
-{
-  hwloc_obj_t child, *temp;
-  hwloc_bitmap_t parent_nodeset = NULL;
-  int parent_weight = 0;
-
-  if (!sys && obj->nodeset) {
-    sys = obj;
-    if (!obj->complete_nodeset)
-      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
-    if (!obj->allowed_nodeset)
-      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
-  }
-
-  if (sys) {
-    if (obj->nodeset) {
-      /* Some existing nodeset coming from above, to possibly propagate down */
-      parent_nodeset = obj->nodeset;
-      parent_weight = hwloc_bitmap_weight(parent_nodeset);
-    } else
-      obj->nodeset = hwloc_bitmap_alloc();
-  }
-
-  for_each_child_safe(child, obj, temp) {
-    /* Propagate singleton nodesets down */
-    if (parent_weight == 1) {
-      if (!child->nodeset)
-        child->nodeset = hwloc_bitmap_dup(obj->nodeset);
-      else if (!hwloc_bitmap_isequal(child->nodeset, parent_nodeset)) {
-        hwloc_debug_bitmap("Oops, parent nodeset %s", parent_nodeset);
-        hwloc_debug_bitmap(" is different from child nodeset %s, ignoring the child one\n", child->nodeset);
-        hwloc_bitmap_copy(child->nodeset, parent_nodeset);
-      }
-    }
+propagate_nodeset(hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
 
-    /* Recurse */
-    propagate_nodeset(child, sys);
+  /* Start our nodeset from the parent one.
+   * It is emptied at the root (no parent), and it's being filled with local nodes
+   * in that branch of the tree as we recurse down.
+   */
+  if (!obj->nodeset)
+    obj->nodeset = hwloc_bitmap_alloc();
+  if (obj->parent)
+    hwloc_bitmap_copy(obj->nodeset, obj->parent->nodeset);
+  else
+    hwloc_bitmap_zero(obj->nodeset);
+
+  /* Don't clear complete_nodeset, just make sure it contains nodeset.
+   * We cannot clear the complete_nodeset at root and rebuild it down because
+   * some bits may correspond to offline/disallowed NUMA nodes missing in the topology.
+   */
+  if (!obj->complete_nodeset)
+    obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+  else
+    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, obj->nodeset);
+
+  /* now add our local nodeset */
+  for_each_memory_child(child, obj) {
+    /* add memory children nodesets to ours */
+    hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, child->complete_nodeset);
+    /* no need to recurse because hwloc__attach_memory_object()
+     * makes sure nodesets are consistent within each memory hierarchy.
+     */
+  }
 
-    /* Propagate children nodesets up */
-    if (sys && child->nodeset)
-      hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+  /* Propagate our nodeset to CPU children. */
+  for_each_child(child, obj) {
+    propagate_nodeset(child);
   }
+
+  /* Propagate CPU children specific nodesets back to us.
+   *
+   * We cannot merge these two loops because we don't want the first child's
+   * nodeset to be propagated back to us and then down to the second child.
+   * Each child may have its own local nodeset,
+   * each of them is propagated to us, but not to other children.
+   */
+  for_each_child(child, obj) {
+    hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+    hwloc_bitmap_or(obj->complete_nodeset, obj->complete_nodeset, child->complete_nodeset);
+  }
+
   /* No nodeset under I/O or Misc */
+
 }
 
-/* Propagate allowed and complete nodesets */
 static void
-propagate_nodesets(hwloc_obj_t obj)
-{
-  hwloc_bitmap_t mask = hwloc_bitmap_alloc();
-  hwloc_obj_t child, *temp;
-
-  for_each_child_safe(child, obj, temp) {
-    if (obj->nodeset) {
-      /* Update complete nodesets down */
-      if (child->complete_nodeset) {
-        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
-      } else if (child->nodeset) {
-        child->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
-        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, child->nodeset);
-      } /* else the child doesn't have nodeset information, we can not provide a complete nodeset */
-
-      /* Update allowed nodesets down */
-      if (child->allowed_nodeset) {
-        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, obj->allowed_nodeset);
-      } else if (child->nodeset) {
-        child->allowed_nodeset = hwloc_bitmap_dup(obj->allowed_nodeset);
-        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, child->nodeset);
-      }
-    }
-
-    propagate_nodesets(child);
+remove_unused_sets(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
 
-    if (obj->nodeset) {
-      /* Update allowed nodesets up */
-      if (child->nodeset && child->allowed_nodeset) {
-        hwloc_bitmap_copy(mask, child->nodeset);
-        hwloc_bitmap_andnot(mask, mask, child->allowed_nodeset);
-        hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, mask);
-      }
-    }
-  }
-  hwloc_bitmap_free(mask);
-  /* No nodeset under I/O or Misc */
+  hwloc_bitmap_and(obj->cpuset, obj->cpuset, topology->allowed_cpuset); /* keep only PUs in the topology-wide allowed set */
+  hwloc_bitmap_and(obj->nodeset, obj->nodeset, topology->allowed_nodeset); /* same for NUMA nodes */
 
-  if (obj->nodeset) {
-    /* Apply complete nodeset to nodeset and allowed_nodeset */
-    if (obj->complete_nodeset)
-      hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->complete_nodeset);
-    else
-      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
-    if (obj->allowed_nodeset)
-      hwloc_bitmap_and(obj->allowed_nodeset, obj->allowed_nodeset, obj->complete_nodeset);
-    else
-      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
-  }
+  for_each_child(child, obj)
+    remove_unused_sets(topology, child);
+  for_each_memory_child(child, obj)
+    remove_unused_sets(topology, child);
+  /* No cpuset or nodeset under I/O or Misc, no need to recurse there */
 }
 
 static void
-remove_unused_sets(hwloc_obj_t obj)
+hwloc__filter_bridges(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
 {
-  hwloc_obj_t child, *temp;
+  hwloc_obj_t child, *pchild;
 
-  if (obj->cpuset) {
-    hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->allowed_cpuset);
-  }
-  if (obj->nodeset) {
-    hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->allowed_nodeset);
+  /* filter I/O children of root and recurse; depth is the bridge depth to store in these children */
+  for_each_io_child_safe(child, root, pchild) {
+    enum hwloc_type_filter_e filter = topology->type_filter[child->type];
+
+    /* recurse into grand-children first, so that a bridge emptied by this filtering is seen as childless below */
+    hwloc__filter_bridges(topology, child, depth+1);
+
+    child->attr->bridge.depth = depth; /* NOTE(review): written for every I/O child, not only bridges - confirm intended */
+
+    if (child->type == HWLOC_OBJ_BRIDGE
+	&& filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT
+	&& !child->io_first_child) {
+      unlink_and_free_single_object(pchild);
+      topology->modified = 1;
+    }
   }
-  if (obj->type == HWLOC_OBJ_NUMANODE && obj->os_index != (unsigned) -1 &&
-      !hwloc_bitmap_isset(obj->allowed_nodeset, obj->os_index)) {
-    unsigned i;
-    hwloc_debug("Dropping memory from disallowed node %u\n", obj->os_index);
-    obj->memory.local_memory = 0;
-    obj->memory.total_memory = 0;
-    for(i=0; i<obj->memory.page_types_len; i++)
-      obj->memory.page_types[i].count = 0;
+}
+
+static void
+hwloc_filter_bridges(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  hwloc_obj_t child = parent->first_child;
+  while (child) {
+    hwloc_filter_bridges(topology, child); /* walk the first_child/next_sibling subtree first */
+    child = child->next_sibling;
   }
 
-  for_each_child_safe(child, obj, temp)
-    remove_unused_sets(child);
-  /* No cpuset under I/O or Misc */
+  hwloc__filter_bridges(topology, parent, 0); /* then filter the I/O children of parent, starting at bridge depth 0 */
 }
 
 void
@@ -1633,48 +2268,12 @@ hwloc__reorder_children(hwloc_obj_t parent)
     child->next_sibling = *prev;
     *prev = child;
   }
-  /* No ordering to enforce for Misc children. */
+  /* No ordering to enforce for Misc or I/O children. */
 }
 
-/* Remove objects that are ignored in any case.
- * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
- * Returns 0 otherwise.
- */
-static int
-ignore_type_always(hwloc_topology_t topology, hwloc_obj_t *pparent)
-{
-  hwloc_obj_t parent = *pparent, child, *pchild;
-  int dropped_children = 0;
-  int dropped = 0;
-
-  /* account dropped normal children only, others don't required reordering */
-  for_each_child_safe(child, parent, pchild)
-    dropped_children += ignore_type_always(topology, pchild);
-  for_each_io_child_safe(child, parent, pchild) /* There can be Misc under I/O */
-    ignore_type_always(topology, pchild);
-  for_each_misc_child_safe(child, parent, pchild)
-    ignore_type_always(topology, pchild);
-
-  if ((parent != topology->levels[0][0] &&
-       topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_ALWAYS)
-      || (parent->type == HWLOC_OBJ_CACHE && parent->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION
-	  && !(topology->flags & HWLOC_TOPOLOGY_FLAG_ICACHES))) {
-    hwloc_debug("%s", "\nDropping ignored object ");
-    hwloc_debug_print_object(0, parent);
-    unlink_and_free_single_object(pparent);
-    topology->modified = 1;
-    dropped = 1;
-
-  } else if (dropped_children) {
-    /* we keep this object but its children changed, reorder them by complete_cpuset */
-    hwloc__reorder_children(parent);
-  }
-
-  return dropped;
-}
-
-/* Remove all children whose cpuset is empty, except NUMA nodes
- * since we want to keep memory information, and except PCI bridges and devices.
+/* Remove all normal children whose cpuset is empty,
+ * and memory children whose nodeset is empty.
+ * Also don't remove objects that have I/O children, but ignore Misc.
  */
 static void
 remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
@@ -1683,209 +2282,274 @@ remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
 
   for_each_child_safe(child, obj, pchild)
     remove_empty(topology, pchild);
+  for_each_memory_child_safe(child, obj, pchild)
+    remove_empty(topology, pchild);
   /* No cpuset under I/O or Misc */
 
-  if (obj->type != HWLOC_OBJ_NUMANODE
-      && !obj->first_child /* only remove if all children were removed above, so that we don't remove parents of NUMAnode */
-      && !obj->io_first_child /* only remove if no I/O is attached there */
-      && hwloc_bitmap_iszero(obj->cpuset)) {
-    /* Remove empty children (even if it has Misc children) */
-    hwloc_debug("%s", "\nRemoving empty object ");
-    hwloc_debug_print_object(0, obj);
-    unlink_and_free_single_object(pobj);
-    topology->modified = 1;
+  if (obj->first_child /* only remove if all children were removed above, so that we don't remove parents of NUMAnode */
+      || obj->memory_first_child /* only remove if no memory attached there */
+      || obj->io_first_child /* only remove if no I/O is attached there */)
+    /* ignore Misc */
+    return;
+
+  if (hwloc__obj_type_is_normal(obj->type)) {
+    if (!hwloc_bitmap_iszero(obj->cpuset))
+      return;
+  } else {
+    assert(hwloc__obj_type_is_memory(obj->type));
+    if (!hwloc_bitmap_iszero(obj->nodeset))
+      return;
   }
+
+  hwloc_debug("%s", "\nRemoving empty object ");
+  hwloc_debug_print_object(0, obj);
+  unlink_and_free_single_object(pobj);
+  topology->modified = 1;
 }
 
-/* Remove objects that are ignored with keep structure flag.
- * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
- * Returns 0 otherwise.
- */
-static int
-ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_t *pparent)
+/* reset type depths to HWLOC_TYPE_DEPTH_UNKNOWN before modifying levels (either reconnecting or filtering/keep_structure) */
+static void
+hwloc_reset_normal_type_depths(hwloc_topology_t topology)
 {
-  hwloc_obj_t parent = *pparent, child, *pchild;
-  int replacechild = 0, replaceparent = 0, droppedchildren = 0;
-
-  if (!parent->first_child) /* can't use arity yet */
-    /* There are no children, nothing to merge. */
-    return 0;
-
-  /* account dropped normal children only, others don't required reordering */
-  for_each_child_safe(child, parent, pchild)
-    droppedchildren += ignore_type_keep_structure(topology, pchild);
-  for_each_io_child_safe(child, parent, pchild)
-    ignore_type_keep_structure(topology, pchild);
-  for_each_misc_child_safe(child, parent, pchild)
-    ignore_type_keep_structure(topology, pchild);
-
-  if (droppedchildren)
-    hwloc__reorder_children(parent);
-
-  child = parent->first_child;
-  /* we don't merge if there are multiple "important" children. */
-  if (child->next_sibling) /* can't use arity yet */
-    return 0;
-
-  /* Check whether parent and/or child can be replaced */
-  if (topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
-    /* Parent can be ignored in favor of the child.  */
-    replaceparent = 1;
-  }
-  if (topology->ignored_types[child->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
-    /* Child can be ignored in favor of the parent.  */
-    replacechild = 1;
-  }
-
-  /* Decide which one to actually replace */
-  if (replaceparent && replacechild) {
-    /* If both may be replaced, look at obj_type_priority */
-    if (obj_type_priority[parent->type] > obj_type_priority[child->type])
-      replaceparent = 0;
-    else
-      replacechild = 0;
-  }
+  unsigned i;
+  for (i=HWLOC_OBJ_TYPE_MIN; i<=HWLOC_OBJ_GROUP; i++)
+    topology->type_depth[i] = HWLOC_TYPE_DEPTH_UNKNOWN;
+  /* the loop above relies on type contiguity (asserted in topology_check()); DIE is reset separately below */
+  topology->type_depth[HWLOC_OBJ_DIE] = HWLOC_TYPE_DEPTH_UNKNOWN;
+}
 
-  if (replaceparent) {
-    /* Replace parent with child */
-    hwloc_debug("%s", "\nIgnoring parent ");
-    hwloc_debug_print_object(0, parent);
-    /* move children to child, so that unlink_and_free_single_object() doesn't move them to the grandparent */
-    if (parent->io_first_child) {
-      append_siblings_list(&child->io_first_child, parent->io_first_child, child);
-      parent->io_first_child = NULL;
-    }
-    if (parent->misc_first_child) {
-      append_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
-      parent->misc_first_child = NULL;
-    }
-    unlink_and_free_single_object(pparent);
-    topology->modified = 1;
+static int
+hwloc_dont_merge_group_level(hwloc_topology_t topology, unsigned i)
+{
+  unsigned j;
 
-  } else if (replacechild) {
-    /* Replace child with parent */
-    hwloc_debug("%s", "\nIgnoring child ");
-    hwloc_debug_print_object(0, child);
-    unlink_and_free_single_object(&parent->first_child);
-    topology->modified = 1;
-  }
+  /* Return 1 if any object of level i has group.dont_merge set, 0 otherwise (reads Group attributes, so level i must contain Groups) */
+  for(j=0; j<topology->level_nbobjects[i]; j++)
+    if (topology->levels[i][j]->attr->group.dont_merge)
+      return 1;
 
-  return replaceparent ? 1 : 0;
+  return 0;
 }
 
-static void
-hwloc_drop_all_io(hwloc_topology_t topology, hwloc_obj_t root)
+/* compare i-th and i-1-th levels structure: return 0 if every object of level i is the single child of the same-rank object of level i-1, -1 otherwise */
+static int
+hwloc_compare_levels_structure(hwloc_topology_t topology, unsigned i)
 {
-  hwloc_obj_t child, *pchild;
-  for_each_child_safe(child, root, pchild) {
-    hwloc_drop_all_io(topology, child);
-  }
-  for_each_io_child_safe(child, root, pchild) {
-    unlink_and_free_object_and_children(pchild);
-    topology->modified = 1;
+  int checkmemory = (topology->levels[i][0]->type == HWLOC_OBJ_PU);
+  unsigned j;
+
+  if (topology->level_nbobjects[i-1] != topology->level_nbobjects[i])
+    return -1;
+
+  for(j=0; j<topology->level_nbobjects[i]; j++) {
+    if (topology->levels[i-1][j] != topology->levels[i][j]->parent)
+      return -1;
+    if (topology->levels[i-1][j]->arity != 1)
+      return -1;
+    if (checkmemory && topology->levels[i-1][j]->memory_arity)
+      /* don't merge PUs if there's memory above (checkmemory is only set when level i contains PUs) */
+      return -1;
   }
-  /* No I/O under Misc */
+  /* same number of objects with arity 1 above, no problem */
+  return 0;
 }
 
-/*
- * If IO_DEVICES and WHOLE_IO are not set, we drop everything.
- * If WHOLE_IO is not set, we drop non-interesting devices,
- * and bridges that have no children.
- * If IO_BRIDGES is also not set, we also drop all bridges
- * except the hostbridges.
- */
+/* Merge adjacent levels with identical structure when a KEEP_STRUCTURE type allows it; levels and depths are updated in place, nothing is returned */
 static void
-hwloc_drop_useless_io(hwloc_topology_t topology, hwloc_obj_t root)
+hwloc_filter_levels_keep_structure(hwloc_topology_t topology)
 {
-  hwloc_obj_t child, *pchild;
-
-  /* recurse into normal children */
-  for_each_child_safe(child, root, pchild) {
-    hwloc_drop_useless_io(topology, child);
-  }
-
-  /* filter I/O children and recurse */
-  for_each_io_child_safe(child, root, pchild) {
-    /* remove useless children if needed */
-    if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_IO)
-	&& child->type == HWLOC_OBJ_PCI_DEVICE) {
-      unsigned classid = child->attr->pcidev.class_id;
-      unsigned baseclass = classid >> 8;
-      if (baseclass != 0x03 /* PCI_BASE_CLASS_DISPLAY */
-	  && baseclass != 0x02 /* PCI_BASE_CLASS_NETWORK */
-	  && baseclass != 0x01 /* PCI_BASE_CLASS_STORAGE */
-	  && baseclass != 0x0b /* PCI_BASE_CLASS_PROCESSOR */
-	  && classid != 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */
-	  && baseclass != 0x12 /* Processing Accelerators */) {
-	unlink_and_free_object_and_children(pchild);
-	topology->modified = 1;
-	continue;
-      }
+  unsigned i, j;
+  int res = 0;
+
+  /* start from the bottom since we'll remove intermediate levels */
+  for(i=topology->nb_levels-1; i>0; i--) {
+    int replacechild = 0, replaceparent = 0;
+    hwloc_obj_t obj1 = topology->levels[i-1][0];
+    hwloc_obj_t obj2 = topology->levels[i][0];
+    hwloc_obj_type_t type1 = obj1->type;
+    hwloc_obj_type_t type2 = obj2->type;
+
+    /* Check whether parents and/or children can be replaced */
+    if (topology->type_filter[type1] == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      /* Parents can be ignored in favor of children.  */
+      replaceparent = 1;
+      if (type1 == HWLOC_OBJ_GROUP && hwloc_dont_merge_group_level(topology, i-1))
+	replaceparent = 0;
+    }
+    if (topology->type_filter[type2] == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      /* Children can be ignored in favor of parents, unless level i (of type2) has Groups marked dont_merge. */
+      replacechild = 1;
+      if (type2 == HWLOC_OBJ_GROUP && hwloc_dont_merge_group_level(topology, i))
+	replacechild = 0;
     }
-    /* recurse to ignore grand-children etc */
-    hwloc_drop_useless_io(topology, child);
-    /* now remove useless bridges if needed */
-    if (child->type == HWLOC_OBJ_BRIDGE) {
-      if (!child->io_first_child) {
-	/* bridges with no children are removed if WHOLE_IO isn't given */
-	if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) {
-	  unlink_and_free_single_object(pchild);
-	  topology->modified = 1;
-	  continue;
+    if (!replacechild && !replaceparent)
+      /* no ignoring */
+      continue;
+    /* Decide which one to actually replace */
+    if (replaceparent && replacechild) {
+      /* If both may be replaced, look at obj_type_priority */
+      if (obj_type_priority[type1] >= obj_type_priority[type2])
+	replaceparent = 0;
+      else
+	replacechild = 0;
+    }
+    /* Are these levels actually identical? */
+    if (hwloc_compare_levels_structure(topology, i) < 0)
+      continue;
+    hwloc_debug("may merge levels #%u=%s and #%u=%s\n",
+		i-1, hwloc_obj_type_string(type1), i, hwloc_obj_type_string(type2));
+
+    /* OK, remove intermediate objects from the tree. */
+    for(j=0; j<topology->level_nbobjects[i]; j++) {
+      hwloc_obj_t parent = topology->levels[i-1][j];
+      hwloc_obj_t child = topology->levels[i][j];
+      unsigned k;
+      if (replacechild) {
+	/* move child's children to parent */
+	parent->first_child = child->first_child;
+	parent->last_child = child->last_child;
+	parent->arity = child->arity;
+	free(parent->children);
+	parent->children = child->children;
+	child->children = NULL;
+	/* update children parent */
+	for(k=0; k<parent->arity; k++)
+	  parent->children[k]->parent = parent;
+	/* append child memory/io/misc children to parent */
+	if (child->memory_first_child) {
+	  append_siblings_list(&parent->memory_first_child, child->memory_first_child, parent);
+	  parent->memory_arity += child->memory_arity;
+	}
+	if (child->io_first_child) {
+	  append_siblings_list(&parent->io_first_child, child->io_first_child, parent);
+	  parent->io_arity += child->io_arity;
+	}
+	if (child->misc_first_child) {
+	  append_siblings_list(&parent->misc_first_child, child->misc_first_child, parent);
+	  parent->misc_arity += child->misc_arity;
+	}
+	hwloc_free_unlinked_object(child);
+      } else {
+	/* replace parent with child in grand-parent */
+	if (parent->parent) {
+	  parent->parent->children[parent->sibling_rank] = child;
+	  child->sibling_rank = parent->sibling_rank;
+	  if (!parent->sibling_rank) {
+	    parent->parent->first_child = child;
+	    /* child->prev_sibling was already NULL, child was single */
+	  } else {
+	    child->prev_sibling = parent->parent->children[parent->sibling_rank-1];
+	    child->prev_sibling->next_sibling = child;
+	  }
+	  if (parent->sibling_rank == parent->parent->arity-1) {
+	    parent->parent->last_child = child;
+	    /* child->next_sibling was already NULL, child was single */
+	  } else {
+	    child->next_sibling = parent->parent->children[parent->sibling_rank+1];
+	    child->next_sibling->prev_sibling = child;
+	  }
+	  /* update child parent */
+	  child->parent = parent->parent;
+	} else {
+	  /* make child the new root */
+	  topology->levels[0][0] = child;
+	  child->parent = NULL;
+	}
+	/* prepend parent memory/io/misc children to child */
+	if (parent->memory_first_child) {
+	  prepend_siblings_list(&child->memory_first_child, parent->memory_first_child, child);
+	  child->memory_arity += parent->memory_arity;
+	}
+	if (parent->io_first_child) {
+	  prepend_siblings_list(&child->io_first_child, parent->io_first_child, child);
+	  child->io_arity += parent->io_arity;
 	}
-      } else if (child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_HOST) {
-	/* only hostbridges are kept if WHOLE_IO or IO_BRIDGE are not given */
-	if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_BRIDGES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) {
-	  unlink_and_free_single_object(pchild);
-	  topology->modified = 1;
-	  continue;
+	if (parent->misc_first_child) {
+	  prepend_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
+	  child->misc_arity += parent->misc_arity;
 	}
+	hwloc_free_unlinked_object(parent);
+	/* prev/next_sibling will be updated below in another loop */
+      }
+    }
+    if (replaceparent && i>1) {
+      /* Update sibling list within modified parent->parent arrays */
+      for(j=0; j<topology->level_nbobjects[i]; j++) {
+	hwloc_obj_t child = topology->levels[i][j];
+	unsigned rank = child->sibling_rank;
+	child->prev_sibling = rank > 0 ? child->parent->children[rank-1] : NULL;
+	child->next_sibling = rank < child->parent->arity-1 ? child->parent->children[rank+1] : NULL;
       }
     }
-  }
 
-  /* No I/O under Misc */
-}
+    /* Update levels so that the next reconnect isn't confused */
+    if (replaceparent) {
+      /* Removing level i-1, so move levels [i..nb_levels-1] to [i-1..] */
+      free(topology->levels[i-1]);
+      memmove(&topology->levels[i-1],
+	      &topology->levels[i],
+	      (topology->nb_levels-i)*sizeof(topology->levels[i]));
+      memmove(&topology->level_nbobjects[i-1],
+	      &topology->level_nbobjects[i],
+	      (topology->nb_levels-i)*sizeof(topology->level_nbobjects[i]));
+      hwloc_debug("removed parent level %s at depth %u\n",
+		  hwloc_obj_type_string(type1), i-1);
+    } else {
+      /* Removing level i, so move levels [i+1..nb_levels-1] and later to [i..] */
+      free(topology->levels[i]);
+      memmove(&topology->levels[i],
+	      &topology->levels[i+1],
+	      (topology->nb_levels-1-i)*sizeof(topology->levels[i]));
+      memmove(&topology->level_nbobjects[i],
+	      &topology->level_nbobjects[i+1],
+	      (topology->nb_levels-1-i)*sizeof(topology->level_nbobjects[i]));
+      hwloc_debug("removed child level %s at depth %u\n",
+		  hwloc_obj_type_string(type2), i);
+    }
+    topology->level_nbobjects[topology->nb_levels-1] = 0;
+    topology->levels[topology->nb_levels-1] = NULL;
+    topology->nb_levels--;
 
-static void
-hwloc_propagate_bridge_depth(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
-{
-  hwloc_obj_t child;
-  for(child = root->first_child; child; child = child->next_sibling) {
-    assert(!depth); /* no normal children under I/O */
-    hwloc_propagate_bridge_depth(topology, child, 0);
+    res++;
   }
-  for(child = root->io_first_child; child; child = child->next_sibling) {
-    if (child->type == HWLOC_OBJ_BRIDGE) {
-      child->attr->bridge.depth = depth;
-      hwloc_propagate_bridge_depth(topology, child, depth+1);
-    } else if (!hwloc_obj_type_is_io(child->type)) {
-      hwloc_propagate_bridge_depth(topology, child, 0);
+
+  if (res > 0) {
+    /* Update object and type depths if some levels were removed */
+    hwloc_reset_normal_type_depths(topology);
+    for(i=0; i<topology->nb_levels; i++) {
+      hwloc_obj_type_t type = topology->levels[i][0]->type;
+      for(j=0; j<topology->level_nbobjects[i]; j++)
+	topology->levels[i][j]->depth = (int)i;
+      if (topology->type_depth[type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+	topology->type_depth[type] = (int)i;
+      else
+	topology->type_depth[type] = HWLOC_TYPE_DEPTH_MULTIPLE;
     }
   }
-  /* No I/O under Misc children */
 }
 
 static void
 hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
 {
-  hwloc_obj_t child, *array;
+  hwloc_obj_t child;
+  unsigned arity = root->arity;
+  hwloc_obj_t *array;
   int ok;
 
   /* assume we're not symmetric by default */
   root->symmetric_subtree = 0;
 
   /* if no child, we are symmetric */
-  if (!root->arity) {
-    root->symmetric_subtree = 1;
-    return;
-  }
+  if (!arity)
+    goto good;
+
+  /* FIXME ignore memory just like I/O and Misc? */
 
   /* look at normal children only, I/O and Misc are ignored.
    * return if any child is not symmetric.
    */
   ok = 1;
-  for(child = root->first_child; child; child = child->next_sibling) {
+  for_each_child(child, root) {
     hwloc_propagate_symmetric_subtree(topology, child);
     if (!child->symmetric_subtree)
       ok = 0;
@@ -1894,33 +2558,52 @@ hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
     return;
   /* Misc and I/O children do not care about symmetric_subtree */
 
+  /* if single child is symmetric, we're good */
+  if (arity == 1)
+    goto good;
+
   /* now check that children subtrees are identical.
    * just walk down the first child in each tree and compare their depth and arities
    */
-  array = malloc(root->arity * sizeof(*array));
-  memcpy(array, root->children, root->arity * sizeof(*array));
+  array = malloc(arity * sizeof(*array));
+  if (!array)
+    return;
+  memcpy(array, root->children, arity * sizeof(*array));
   while (1) {
     unsigned i;
     /* check current level arities and depth */
-    for(i=1; i<root->arity; i++)
+    for(i=1; i<arity; i++)
       if (array[i]->depth != array[0]->depth
 	  || array[i]->arity != array[0]->arity) {
-      free(array);
-      return;
-    }
+	free(array);
+	return;
+      }
     if (!array[0]->arity)
       /* no more children level, we're ok */
       break;
     /* look at first child of each element now */
-    for(i=0; i<root->arity; i++)
+    for(i=0; i<arity; i++)
       array[i] = array[i]->first_child;
   }
   free(array);
 
   /* everything went fine, we're symmetric */
+ good:
   root->symmetric_subtree = 1;
 }
 
+static void hwloc_set_group_depth(hwloc_topology_t topology)
+{
+  unsigned groupdepth = 0;
+  unsigned i, j;
+  for(i=0; i<topology->nb_levels; i++)
+    if (topology->levels[i][0]->type == HWLOC_OBJ_GROUP) {
+      for (j = 0; j < topology->level_nbobjects[i]; j++)
+	topology->levels[i][j]->attr->group.depth = groupdepth;
+      groupdepth++;
+    }
+}
+
 /*
  * Initialize handy pointers in the whole topology.
  * The topology only had first_child and next_sibling pointers.
@@ -1930,7 +2613,7 @@ hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
  *
  * Can be called several times, so may have to update the array.
  */
-void
+static void
 hwloc_connect_children(hwloc_obj_t parent)
 {
   unsigned n, oldn = parent->arity;
@@ -1958,11 +2641,11 @@ hwloc_connect_children(hwloc_obj_t parent)
     /* no need for an array anymore */
     free(parent->children);
     parent->children = NULL;
-    goto io;
+    goto memory;
   }
   if (ok)
     /* array is already OK (even if too large) */
-    goto io;
+    goto memory;
 
   /* alloc a larger array if needed */
   if (oldn < n) {
@@ -1976,8 +2659,23 @@ hwloc_connect_children(hwloc_obj_t parent)
     parent->children[n] = child;
   }
 
-  /* Misc children list */
- io:
+
+
+ memory:
+  /* Memory children list */
+
+  prev_child = NULL;
+  for (n = 0, child = parent->memory_first_child;
+       child;
+       n++,   prev_child = child, child = child->next_sibling) {
+    child->parent = parent;
+    child->sibling_rank = n;
+    child->prev_sibling = prev_child;
+    hwloc_connect_children(child);
+  }
+  parent->memory_arity = n;
+
+  /* I/O children list */
 
   prev_child = NULL;
   for (n = 0, child = parent->io_first_child;
@@ -2005,65 +2703,31 @@ hwloc_connect_children(hwloc_obj_t parent)
 }
 
 /*
- * Check whether there is an object below ROOT that has the same type as OBJ
+ * Check whether there is an object strictly below ROOT that has the same type as OBJ
  */
 static int
 find_same_type(hwloc_obj_t root, hwloc_obj_t obj)
 {
   hwloc_obj_t child;
 
-  if (hwloc_type_cmp(root, obj) == HWLOC_TYPE_EQUAL)
-    return 1;
-
-  for (child = root->first_child; child; child = child->next_sibling)
+  for_each_child (child, root) {
+    if (hwloc_type_cmp(child, obj) == HWLOC_OBJ_EQUAL)
+      return 1;
     if (find_same_type(child, obj))
       return 1;
+  }
 
   return 0;
 }
 
-/* traverse the array of current object and compare them with top_obj.
- * if equal, take the object and put its children into the remaining objs.
- * if not equal, put the object into the remaining objs.
- */
 static int
-hwloc_level_take_objects(hwloc_obj_t top_obj,
-			 hwloc_obj_t *current_objs, unsigned n_current_objs,
-			 hwloc_obj_t *taken_objs, unsigned n_taken_objs __hwloc_attribute_unused,
-			 hwloc_obj_t *remaining_objs, unsigned n_remaining_objs __hwloc_attribute_unused)
-{
-  unsigned taken_i = 0;
-  unsigned new_i = 0;
-  unsigned i, j;
-
-  for (i = 0; i < n_current_objs; i++)
-    if (hwloc_type_cmp(top_obj, current_objs[i]) == HWLOC_TYPE_EQUAL) {
-      /* Take it, add main children.  */
-      taken_objs[taken_i++] = current_objs[i];
-      for (j = 0; j < current_objs[i]->arity; j++)
-	remaining_objs[new_i++] = current_objs[i]->children[j];
-    } else {
-      /* Leave it.  */
-      remaining_objs[new_i++] = current_objs[i];
-    }
-
-#ifdef HWLOC_DEBUG
-  /* Make sure we didn't mess up.  */
-  assert(taken_i == n_taken_objs);
-  assert(new_i == n_current_objs - n_taken_objs + n_remaining_objs);
-#endif
-
-  return new_i;
-}
-
-static unsigned
-hwloc_build_level_from_list(struct hwloc_obj *first, struct hwloc_obj ***levelp)
+hwloc_build_level_from_list(struct hwloc_special_level_s *slevel)
 {
   unsigned i, nb;
   struct hwloc_obj * obj;
 
   /* count */
-  obj = first;
+  obj = slevel->first;
   i = 0;
   while (obj) {
     i++;
@@ -2071,145 +2735,139 @@ hwloc_build_level_from_list(struct hwloc_obj *first, struct hwloc_obj ***levelp)
   }
   nb = i;
 
-  /* allocate and fill level */
-  *levelp = malloc(nb * sizeof(struct hwloc_obj *));
-  obj = first;
-  i = 0;
-  while (obj) {
-    obj->logical_index = i;
-    (*levelp)[i] = obj;
-    i++;
-    obj = obj->next_cousin;
+  if (nb) {
+    /* allocate and fill level */
+    slevel->objs = malloc(nb * sizeof(struct hwloc_obj *));
+    if (!slevel->objs)
+      return -1;
+
+    obj = slevel->first;
+    i = 0;
+    while (obj) {
+      obj->logical_index = i;
+      slevel->objs[i] = obj;
+      i++;
+      obj = obj->next_cousin;
+    }
   }
 
-  return nb;
+  slevel->nbobjs = nb;
+  return 0;
+}
+
+static void
+hwloc_append_special_object(struct hwloc_special_level_s *level, hwloc_obj_t obj)
+{
+  if (level->first) {
+    obj->prev_cousin = level->last;
+    obj->prev_cousin->next_cousin = obj;
+    level->last = obj;
+  } else {
+    obj->prev_cousin = NULL;
+    level->first = level->last = obj;
+  }
 }
 
-/* Append I/O objects to their lists */
+/* Append special objects to their lists */
 static void
-hwloc_list_io_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+hwloc_list_special_objects(hwloc_topology_t topology, hwloc_obj_t obj)
 {
-  hwloc_obj_t child, *temp;
+  hwloc_obj_t child;
 
-  if (hwloc_obj_type_is_io(obj->type)) {
-    /* make sure we don't have remaining stale pointers from a previous load */
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    obj->next_cousin = NULL;
+    obj->depth = HWLOC_TYPE_DEPTH_NUMANODE;
+    /* Insert in the main NUMA node list */
+    hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_NUMANODE], obj);
+
+    /* Recurse, NUMA nodes only have Misc children */
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else if (obj->type == HWLOC_OBJ_MEMCACHE) {
+    obj->next_cousin = NULL;
+    obj->depth = HWLOC_TYPE_DEPTH_MEMCACHE;
+    /* Insert in the main MemCache list */
+    hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_MEMCACHE], obj);
+
+    /* Recurse, MemCaches have NUMA nodes or Misc children */
+    for_each_memory_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else if (obj->type == HWLOC_OBJ_MISC) {
+    obj->next_cousin = NULL;
+    obj->depth = HWLOC_TYPE_DEPTH_MISC;
+    /* Insert in the main Misc list */
+    hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_MISC], obj);
+    /* Recurse, Misc only have Misc children */
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else if (hwloc__obj_type_is_io(obj->type)) {
     obj->next_cousin = NULL;
-    obj->prev_cousin = NULL;
 
     if (obj->type == HWLOC_OBJ_BRIDGE) {
       obj->depth = HWLOC_TYPE_DEPTH_BRIDGE;
       /* Insert in the main bridge list */
-      if (topology->first_bridge) {
-	obj->prev_cousin = topology->last_bridge;
-	obj->prev_cousin->next_cousin = obj;
-	topology->last_bridge = obj;
-      } else {
-	topology->first_bridge = topology->last_bridge = obj;
-      }
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_BRIDGE], obj);
 
     } else if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
       obj->depth = HWLOC_TYPE_DEPTH_PCI_DEVICE;
       /* Insert in the main pcidev list */
-      if (topology->first_pcidev) {
-	obj->prev_cousin = topology->last_pcidev;
-	obj->prev_cousin->next_cousin = obj;
-	topology->last_pcidev = obj;
-      } else {
-	topology->first_pcidev = topology->last_pcidev = obj;
-      }
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_PCIDEV], obj);
 
     } else if (obj->type == HWLOC_OBJ_OS_DEVICE) {
       obj->depth = HWLOC_TYPE_DEPTH_OS_DEVICE;
       /* Insert in the main osdev list */
-      if (topology->first_osdev) {
-	obj->prev_cousin = topology->last_osdev;
-	obj->prev_cousin->next_cousin = obj;
-	topology->last_osdev = obj;
-      } else {
-	topology->first_osdev = topology->last_osdev = obj;
-      }
+      hwloc_append_special_object(&topology->slevels[HWLOC_SLEVEL_OSDEV], obj);
     }
-  }
 
-  for_each_child_safe(child, obj, temp)
-    hwloc_list_io_objects(topology, child);
-  for_each_io_child_safe(child, obj, temp)
-    hwloc_list_io_objects(topology, child);
-  /* No I/O under Misc */
+    /* Recurse, I/O only have I/O and Misc children */
+    for_each_io_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+
+  } else {
+    /* Recurse */
+    for_each_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_memory_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_io_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+    for_each_misc_child(child, obj)
+      hwloc_list_special_objects(topology, child);
+  }
 }
 
 /* Build I/O levels */
-static void
-hwloc_connect_io_levels(hwloc_topology_t topology)
+static int
+hwloc_connect_io_misc_levels(hwloc_topology_t topology)
 {
-  free(topology->bridge_level);
-  topology->bridge_level = NULL;
-  topology->bridge_nbobjects = 0;
-  topology->first_bridge = topology->last_bridge = NULL;
-  topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
-
-  free(topology->pcidev_level);
-  topology->pcidev_level = NULL;
-  topology->pcidev_nbobjects = 0;
-  topology->first_pcidev = topology->last_pcidev = NULL;
-  topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
-
-  free(topology->osdev_level);
-  topology->osdev_level = NULL;
-  topology->osdev_nbobjects = 0;
-  topology->first_osdev = topology->last_osdev = NULL;
-  topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+  unsigned i;
 
-  hwloc_list_io_objects(topology, topology->levels[0][0]);
-  topology->bridge_nbobjects = hwloc_build_level_from_list(topology->first_bridge, &topology->bridge_level);
-  topology->pcidev_nbobjects = hwloc_build_level_from_list(topology->first_pcidev, &topology->pcidev_level);
-  topology->osdev_nbobjects = hwloc_build_level_from_list(topology->first_osdev, &topology->osdev_level);
-}
+  for(i=0; i<HWLOC_NR_SLEVELS; i++)
+    free(topology->slevels[i].objs);
+  memset(&topology->slevels, 0, sizeof(topology->slevels));
 
-/* Append Misc object to their list */
-static void
-hwloc_list_misc_objects(hwloc_topology_t topology, hwloc_obj_t obj)
-{
-  hwloc_obj_t child, *temp;
+  hwloc_list_special_objects(topology, topology->levels[0][0]);
 
-  if (obj->type == HWLOC_OBJ_MISC) {
-    obj->depth = HWLOC_TYPE_DEPTH_MISC;
-    /* Insert the main Misc list */
-    if (topology->first_misc) {
-      obj->prev_cousin = topology->last_misc;
-      obj->prev_cousin->next_cousin = obj;
-      topology->last_misc = obj;
-    } else {
-      topology->first_misc = topology->last_misc = obj;
-    }
+  for(i=0; i<HWLOC_NR_SLEVELS; i++) {
+    if (hwloc_build_level_from_list(&topology->slevels[i]) < 0)
+      return -1;
   }
 
-  for_each_child_safe(child, obj, temp)
-    hwloc_list_misc_objects(topology, child);
-  for_each_io_child_safe(child, obj, temp)
-    hwloc_list_misc_objects(topology, child);
-  for_each_misc_child_safe(child, obj, temp)
-    hwloc_list_misc_objects(topology, child);
-}
-
-/* Build Misc level */
-static void
-hwloc_connect_misc_level(hwloc_topology_t topology)
-{
-  free(topology->misc_level);
-  topology->misc_level = NULL;
-  topology->misc_nbobjects = 0;
-  topology->first_misc = topology->last_misc = NULL;
-  topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
-
-  hwloc_list_misc_objects(topology, topology->levels[0][0]);
-  topology->misc_nbobjects = hwloc_build_level_from_list(topology->first_misc, &topology->misc_level);
+  return 0;
 }
 
 /*
  * Do the remaining work that hwloc_connect_children() did not do earlier.
+ * Requires object arity and children list to be properly initialized (by hwloc_connect_children()).
  */
-int
+static int
 hwloc_connect_levels(hwloc_topology_t topology)
 {
   unsigned l, i=0;
@@ -2217,16 +2875,14 @@ hwloc_connect_levels(hwloc_topology_t topology)
   unsigned n_objs, n_taken_objs, n_new_objs;
 
   /* reset non-root levels (root was initialized during init and will not change here) */
-  for(l=1; l<HWLOC_DEPTH_MAX; l++)
+  for(l=1; l<topology->nb_levels; l++)
     free(topology->levels[l]);
-  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
-  memset(topology->level_nbobjects+1, 0,  (HWLOC_DEPTH_MAX-1)*sizeof(*topology->level_nbobjects));
+  memset(topology->levels+1, 0, (topology->nb_levels-1)*sizeof(*topology->levels));
+  memset(topology->level_nbobjects+1, 0, (topology->nb_levels-1)*sizeof(*topology->level_nbobjects));
   topology->nb_levels = 1;
-  /* don't touch next_group_depth, the Group objects are still here */
 
-  /* initialize all depth to unknown */
-  for (l = HWLOC_OBJ_SYSTEM; l < HWLOC_OBJ_TYPE_MAX; l++)
-    topology->type_depth[l] = HWLOC_TYPE_DEPTH_UNKNOWN;
+  /* initialize all non-IO/non-Misc depths to unknown */
+  hwloc_reset_normal_type_depths(topology);
 
   /* initialize root type depth */
   root = topology->levels[0][0];
@@ -2267,7 +2923,7 @@ hwloc_connect_levels(hwloc_topology_t topology)
 
     /* See if this is actually the topmost object */
     for (i = 0; i < n_objs; i++) {
-      if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_TYPE_EQUAL) {
+      if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_OBJ_EQUAL) {
 	if (find_same_type(objs[i], top_obj)) {
 	  /* OBJS[i] is strictly above an object of the same type as TOP_OBJ, so it
 	   * is above TOP_OBJ.  */
@@ -2279,36 +2935,52 @@ hwloc_connect_levels(hwloc_topology_t topology)
     /* Now peek all objects of the same type, build a level with that and
      * replace them with their children.  */
 
-    /* First count them.  */
-    n_taken_objs = 0;
+    /* allocate enough to take all current objects and an ending NULL */
+    taken_objs = malloc((n_objs+1) * sizeof(taken_objs[0]));
+    if (!taken_objs) {
+      free(objs);
+      errno = ENOMEM;
+      return -1;
+    }
+
+    /* allocate enough to keep all current objects or their children */
+    n_new_objs = 0;
+    for (i = 0; i < n_objs; i++) {
+      if (objs[i]->arity)
+	n_new_objs += objs[i]->arity;
+      else
+	n_new_objs++;
+    }
+    new_objs = malloc(n_new_objs * sizeof(new_objs[0]));
+    if (!new_objs) {
+      free(objs);
+      free(taken_objs);
+      errno = ENOMEM;
+      return -1;
+    }
+
+    /* now actually take these objects */
     n_new_objs = 0;
+    n_taken_objs = 0;
     for (i = 0; i < n_objs; i++)
-      if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_TYPE_EQUAL) {
-	n_taken_objs++;
+      if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_OBJ_EQUAL) {
+	/* Take it, add main children.  */
+	taken_objs[n_taken_objs++] = objs[i];
+	memcpy(&new_objs[n_new_objs], objs[i]->children, objs[i]->arity * sizeof(new_objs[0]));
 	n_new_objs += objs[i]->arity;
+      } else {
+	/* Leave it.  */
+	new_objs[n_new_objs++] = objs[i];
       }
 
-    /* New level.  */
-    taken_objs = malloc((n_taken_objs + 1) * sizeof(taken_objs[0]));
-    /* New list of pending objects.  */
-    if (n_objs - n_taken_objs + n_new_objs) {
-      new_objs = malloc((n_objs - n_taken_objs + n_new_objs) * sizeof(new_objs[0]));
-    } else {
-#ifdef HWLOC_DEBUG
-      assert(!n_new_objs);
-      assert(n_objs == n_taken_objs);
-#endif
+    if (!n_new_objs) {
+      free(new_objs);
       new_objs = NULL;
     }
 
-    n_new_objs = hwloc_level_take_objects(top_obj,
-					  objs, n_objs,
-					  taken_objs, n_taken_objs,
-					  new_objs, n_new_objs);
-
     /* Ok, put numbers in the level and link cousins.  */
     for (i = 0; i < n_taken_objs; i++) {
-      taken_objs[i]->depth = topology->nb_levels;
+      taken_objs[i]->depth = (int) topology->nb_levels;
       taken_objs[i]->logical_index = i;
       if (i) {
 	taken_objs[i]->prev_cousin = taken_objs[i-1];
@@ -2319,19 +2991,48 @@ hwloc_connect_levels(hwloc_topology_t topology)
     taken_objs[n_taken_objs-1]->next_cousin = NULL;
 
     /* One more level!  */
-    if (top_obj->type == HWLOC_OBJ_CACHE)
-      hwloc_debug("--- Cache level depth %u", top_obj->attr->cache.depth);
-    else
-      hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
+    hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
     hwloc_debug(" has number %u\n\n", topology->nb_levels);
 
     if (topology->type_depth[top_obj->type] == HWLOC_TYPE_DEPTH_UNKNOWN)
-      topology->type_depth[top_obj->type] = topology->nb_levels;
+      topology->type_depth[top_obj->type] = (int) topology->nb_levels;
     else
       topology->type_depth[top_obj->type] = HWLOC_TYPE_DEPTH_MULTIPLE; /* mark as unknown */
 
     taken_objs[n_taken_objs] = NULL;
 
+    if (topology->nb_levels == topology->nb_levels_allocated) {
+      /* extend the arrays of levels */
+      void *tmplevels, *tmpnbobjs;
+      tmplevels = realloc(topology->levels,
+			  2 * topology->nb_levels_allocated * sizeof(*topology->levels));
+      tmpnbobjs = realloc(topology->level_nbobjects,
+			  2 * topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
+      if (!tmplevels || !tmpnbobjs) {
+	fprintf(stderr, "hwloc failed to realloc level arrays to %u\n", topology->nb_levels_allocated * 2);
+
+	/* if one realloc succeeded, make sure the caller will free the new buffer */
+	if (tmplevels)
+	  topology->levels = tmplevels;
+	if (tmpnbobjs)
+	  topology->level_nbobjects = tmpnbobjs;
+	/* the realloc that failed left topology->level_foo untouched, will be freed by the caller */
+
+	free(objs);
+	free(taken_objs);
+	free(new_objs);
+	errno = ENOMEM;
+	return -1;
+      }
+      topology->levels = tmplevels;
+      topology->level_nbobjects = tmpnbobjs;
+      memset(topology->levels + topology->nb_levels_allocated,
+	     0, topology->nb_levels_allocated * sizeof(*topology->levels));
+      memset(topology->level_nbobjects + topology->nb_levels_allocated,
+	     0, topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
+      topology->nb_levels_allocated *= 2;
+    }
+    /* add the new level */
     topology->level_nbobjects[topology->nb_levels] = n_taken_objs;
     topology->levels[topology->nb_levels] = taken_objs;
 
@@ -2345,43 +3046,142 @@ hwloc_connect_levels(hwloc_topology_t topology)
   }
 
   /* It's empty now.  */
-  if (objs)
-    free(objs);
+  free(objs);
 
-  hwloc_connect_io_levels(topology);
-  hwloc_connect_misc_level(topology);
+  return 0;
+}
 
-  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+int
+hwloc_topology_reconnect(struct hwloc_topology *topology, unsigned long flags)
+{
+  if (flags) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (!topology->modified)
+    return 0;
+
+  hwloc_connect_children(topology->levels[0][0]);
+
+  if (hwloc_connect_levels(topology) < 0)
+    return -1;
+
+  if (hwloc_connect_io_misc_levels(topology) < 0)
+    return -1;
+
+  topology->modified = 0;
 
   return 0;
 }
 
-void hwloc_alloc_obj_cpusets(hwloc_obj_t obj)
+/* for regression testing, make sure the order of io devices
+ * doesn't change with the dentry order in the filesystem
+ *
+ * Only needed for OSDev for now.
+ */
+static hwloc_obj_t
+hwloc_debug_insert_osdev_sorted(hwloc_obj_t queue, hwloc_obj_t obj)
 {
-  if (!obj->cpuset)
-    obj->cpuset = hwloc_bitmap_alloc_full();
-  if (!obj->complete_cpuset)
-    obj->complete_cpuset = hwloc_bitmap_alloc();
-  if (!obj->allowed_cpuset)
-    obj->allowed_cpuset = hwloc_bitmap_alloc_full();
-  if (!obj->nodeset)
-    obj->nodeset = hwloc_bitmap_alloc();
-  if (!obj->complete_nodeset)
-    obj->complete_nodeset = hwloc_bitmap_alloc();
-  if (!obj->allowed_nodeset)
-    obj->allowed_nodeset = hwloc_bitmap_alloc_full();
+  hwloc_obj_t *pcur = &queue;
+  while (*pcur && strcmp((*pcur)->name, obj->name) < 0)
+    pcur = &((*pcur)->next_sibling);
+  obj->next_sibling = *pcur;
+  *pcur = obj;
+  return queue;
+}
+
+static void
+hwloc_debug_sort_children(hwloc_obj_t root)
+{
+  hwloc_obj_t child;
+
+  if (root->io_first_child) {
+    hwloc_obj_t osdevqueue, *pchild;
+
+    pchild = &root->io_first_child;
+    osdevqueue = NULL;
+    while ((child = *pchild) != NULL) {
+      if (child->type != HWLOC_OBJ_OS_DEVICE) {
+	/* keep non-osdev untouched */
+	pchild = &child->next_sibling;
+	continue;
+      }
+
+      /* dequeue this child */
+      *pchild = child->next_sibling;
+      child->next_sibling = NULL;
+
+      /* insert in osdev queue in order */
+      osdevqueue = hwloc_debug_insert_osdev_sorted(osdevqueue, child);
+    }
+
+    /* requeue the now-sorted osdev queue */
+    *pchild = osdevqueue;
+  }
+
+  /* Recurse */
+  for_each_child(child, root)
+    hwloc_debug_sort_children(child);
+  for_each_memory_child(child, root)
+    hwloc_debug_sort_children(child);
+  for_each_io_child(child, root)
+    hwloc_debug_sort_children(child);
+  /* no I/O under Misc */
+}
+
+void hwloc_alloc_root_sets(hwloc_obj_t root)
+{
+  /*
+   * All sets are initially NULL.
+   *
+   * At least one backend should call this function to initialize all sets at once.
+   * XML uses it lazily in case only some sets were given in the XML import.
+   *
+   * Other backends can check root->cpuset != NULL to see if somebody
+   * discovered things before them.
+   */
+  if (!root->cpuset)
+     root->cpuset = hwloc_bitmap_alloc();
+  if (!root->complete_cpuset)
+     root->complete_cpuset = hwloc_bitmap_alloc();
+  if (!root->nodeset)
+    root->nodeset = hwloc_bitmap_alloc();
+  if (!root->complete_nodeset)
+    root->complete_nodeset = hwloc_bitmap_alloc();
+}
+
+static void
+hwloc_discover_by_phase(struct hwloc_topology *topology,
+			struct hwloc_disc_status *dstatus,
+			const char *phasename __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  hwloc_debug("%s phase discovery...\n", phasename);
+  for(backend = topology->backends; backend; backend = backend->next) {
+    if (dstatus->phase & dstatus->excluded_phases)
+      break;
+    if (!(backend->phases & dstatus->phase))
+      continue;
+    if (!backend->discover)
+      continue;
+    hwloc_debug("%s phase discovery in component %s...\n", phasename, backend->component->name);
+    backend->discover(backend, dstatus);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+  }
 }
 
 /* Main discovery loop */
 static int
-hwloc_discover(struct hwloc_topology *topology)
+hwloc_discover(struct hwloc_topology *topology,
+	       struct hwloc_disc_status *dstatus)
 {
-  struct hwloc_backend *backend;
-  int gotsomeio = 0;
-  unsigned discoveries = 0;
+  const char *env;
 
   topology->modified = 0; /* no need to reconnect yet */
 
+  topology->allowed_cpuset = hwloc_bitmap_alloc_full();
+  topology->allowed_nodeset = hwloc_bitmap_alloc_full();
+
   /* discover() callbacks should use hwloc_insert to add objects initialized
    * through hwloc_alloc_setup_object.
    * For node levels, nodeset and memory must be initialized.
@@ -2417,202 +3217,221 @@ hwloc_discover(struct hwloc_topology *topology)
    * automatically propagated to the whole tree after detection.
    */
 
-  /*
-   * Discover CPUs first
-   */
-  backend = topology->backends;
-  while (NULL != backend) {
-    int err;
-    if (backend->component->type != HWLOC_DISC_COMPONENT_TYPE_CPU
-	&& backend->component->type != HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
-      /* not yet */
-      goto next_cpubackend;
-    if (!backend->discover)
-      goto next_cpubackend;
-
-    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
-      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
-      hwloc_connect_children(topology->levels[0][0]);
-      if (hwloc_connect_levels(topology) < 0)
-	return -1;
-      topology->modified = 0;
-    }
+  if (topology->backend_phases & HWLOC_DISC_PHASE_GLOBAL) {
+    /* usually, GLOBAL is alone.
+     * but HWLOC_ANNOTATE_GLOBAL_COMPONENTS=1 allows optional ANNOTATE steps.
+     */
+    struct hwloc_backend *global_backend = topology->backends;
+    assert(global_backend);
+    assert(global_backend->phases == HWLOC_DISC_PHASE_GLOBAL);
 
-    err = backend->discover(backend);
-    if (err >= 0) {
-      if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
-        gotsomeio += err;
-      discoveries++;
-    }
+    /*
+     * Perform the single-component-based GLOBAL discovery
+     */
+    hwloc_debug("GLOBAL phase discovery...\n");
+    hwloc_debug("GLOBAL phase discovery with component %s...\n", global_backend->component->name);
+    dstatus->phase = HWLOC_DISC_PHASE_GLOBAL;
+    global_backend->discover(global_backend, dstatus);
     hwloc_debug_print_objects(0, topology->levels[0][0]);
+  }
+  /* Don't explicitly ignore other phases, in case there's ever
+   * a need to bring them back.
+   * The component will usually exclude them by default anyway.
+   * Except if annotating global components is explicitly requested.
+   */
+
+  if (topology->backend_phases & HWLOC_DISC_PHASE_CPU) {
+    /*
+     * Discover CPUs first
+     */
+    dstatus->phase = HWLOC_DISC_PHASE_CPU;
+    hwloc_discover_by_phase(topology, dstatus, "CPU");
+  }
 
-next_cpubackend:
-    backend = backend->next;
+  if (!(topology->backend_phases & (HWLOC_DISC_PHASE_GLOBAL|HWLOC_DISC_PHASE_CPU))) {
+    hwloc_debug("No GLOBAL or CPU component phase found\n");
+    /* we'll fail below */
   }
 
-  if (!discoveries) {
-    hwloc_debug("%s", "No CPU backend enabled or no discovery succeeded\n");
+  /* One backend should have called hwloc_alloc_root_sets()
+   * and set bits during PU and NUMA insert.
+   */
+  if (!topology->levels[0][0]->cpuset || hwloc_bitmap_iszero(topology->levels[0][0]->cpuset)) {
+    hwloc_debug("%s", "No PU added by any CPU or GLOBAL component phase\n");
     errno = EINVAL;
     return -1;
   }
 
-  /* Update objects cpusets and nodesets now that the CPU/GLOBAL backend populated PUs and nodes */
-
-  hwloc_debug("%s", "\nRestrict topology cpusets to existing PU and NODE objects\n");
-  collect_proc_cpuset(topology->levels[0][0], NULL);
+  /*
+   * Memory-specific discovery
+   */
+  if (topology->backend_phases & HWLOC_DISC_PHASE_MEMORY) {
+    dstatus->phase = HWLOC_DISC_PHASE_MEMORY;
+    hwloc_discover_by_phase(topology, dstatus, "MEMORY");
+  }
 
-  hwloc_debug("%s", "\nPropagate disallowed cpus down and up\n");
-  propagate_unused_cpuset(topology->levels[0][0], NULL);
+  if (/* check if getting the sets of locally allowed resources is possible */
+      topology->binding_hooks.get_allowed_resources
+      && topology->is_thissystem
+      /* check whether it has been done already */
+      && !(dstatus->flags & HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES)
+      /* check whether it was explicitly requested */
+      && ((topology->flags & HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES) != 0
+	  || ((env = getenv("HWLOC_THISSYSTEM_ALLOWED_RESOURCES")) != NULL && atoi(env)))) {
+    /* OK, get the sets of locally allowed resources */
+    topology->binding_hooks.get_allowed_resources(topology);
+    dstatus->flags |= HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES;
+  }
 
-  /* Backends must allocate root->*nodeset.
-   *
-   * Most of them call hwloc_alloc_obj_cpusets() on the root to do so.
-   * root->complete_nodeset is empty by default, and filled by the core
-   * when NUMA nodes are added with insert_by_cpuset().
-   * root->allowed_nodeset is everything by default, unless reduced by backends.
-   *
-   * The XML backend takes care of everything to properly support old XML input
-   * with missing nodesets and/or NUMA nodes. It checks nodesets and fix them if needed.
+  /* If there's no NUMA node, add one with all the memory.
+   * root->complete_nodeset wouldn't be empty if any NUMA was ever added:
+   * - insert_by_cpuset() adds bits when PU/NUMA are added.
+   * - XML takes care of sanitizing nodesets.
    */
-  assert(topology->levels[0][0]->nodeset);
-  assert(topology->levels[0][0]->complete_nodeset);
-  assert(topology->levels[0][0]->allowed_nodeset);
-  /* If there's no NUMA node, add one with all the memory */
   if (hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset)) {
-    hwloc_obj_t node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
-    node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset); /* requires root cpuset to be initialized above */
-    node->complete_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->complete_cpuset); /* requires root cpuset to be initialized above */
-    node->allowed_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->allowed_cpuset); /* requires root cpuset to be initialized above */
+    hwloc_obj_t node;
+    hwloc_debug("%s", "\nAdd missing single NUMA node\n");
+    node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, 0);
+    node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset);
     node->nodeset = hwloc_bitmap_alloc();
     /* other nodesets will be filled below */
     hwloc_bitmap_set(node->nodeset, 0);
-    memcpy(&node->memory, &topology->levels[0][0]->memory, sizeof(node->memory));
-    memset(&topology->levels[0][0]->memory, 0, sizeof(node->memory));
+    memcpy(&node->attr->numanode, &topology->machine_memory, sizeof(topology->machine_memory));
+    memset(&topology->machine_memory, 0, sizeof(topology->machine_memory));
     hwloc_insert_object_by_cpuset(topology, node);
+  } else {
+    /* if we're sure we found all NUMA nodes without their sizes (x86 backend?),
+     * we could split topology->total_memory in all of them.
+     */
+    free(topology->machine_memory.page_types);
+    memset(&topology->machine_memory, 0, sizeof(topology->machine_memory));
   }
-  hwloc_debug("%s", "\nPropagate nodesets\n");
-  propagate_nodeset(topology->levels[0][0], NULL);
-  propagate_nodesets(topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nFixup root sets\n");
+  hwloc_bitmap_and(topology->levels[0][0]->cpuset, topology->levels[0][0]->cpuset, topology->levels[0][0]->complete_cpuset);
+  hwloc_bitmap_and(topology->levels[0][0]->nodeset, topology->levels[0][0]->nodeset, topology->levels[0][0]->complete_nodeset);
+
+  hwloc_bitmap_and(topology->allowed_cpuset, topology->allowed_cpuset, topology->levels[0][0]->cpuset);
+  hwloc_bitmap_and(topology->allowed_nodeset, topology->allowed_nodeset, topology->levels[0][0]->nodeset);
+
+  hwloc_debug("%s", "\nPropagate sets\n");
+  /* cpuset are already there thanks to the _by_cpuset insertion,
+   * but nodeset have to be propagated below and above NUMA nodes
+   */
+  propagate_nodeset(topology->levels[0][0]);
+  /* now fixup parent/children sets */
+  fixup_sets(topology->levels[0][0]);
 
   hwloc_debug_print_objects(0, topology->levels[0][0]);
 
-  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED)) {
     hwloc_debug("%s", "\nRemoving unauthorized sets from all sets\n");
-    remove_unused_sets(topology->levels[0][0]);
+    remove_unused_sets(topology, topology->levels[0][0]);
     hwloc_debug_print_objects(0, topology->levels[0][0]);
   }
 
-  /*
-   * All object cpusets and nodesets are properly set now.
-   */
+  /* see if we should ignore the root now that we know how many children it has */
+  if (!hwloc_filter_check_keep_object(topology, topology->levels[0][0])
+      && topology->levels[0][0]->first_child && !topology->levels[0][0]->first_child->next_sibling) {
+    hwloc_obj_t oldroot = topology->levels[0][0];
+    hwloc_obj_t newroot = oldroot->first_child;
+    /* switch to the new root */
+    newroot->parent = NULL;
+    topology->levels[0][0] = newroot;
+    /* move oldroot memory/io/misc children before newroot children */
+    if (oldroot->memory_first_child)
+      prepend_siblings_list(&newroot->memory_first_child, oldroot->memory_first_child, newroot);
+    if (oldroot->io_first_child)
+      prepend_siblings_list(&newroot->io_first_child, oldroot->io_first_child, newroot);
+    if (oldroot->misc_first_child)
+      prepend_siblings_list(&newroot->misc_first_child, oldroot->misc_first_child, newroot);
+    /* destroy oldroot and use the new one */
+    hwloc_free_unlinked_object(oldroot);
+  }
 
   /*
-   * Group levels by distances
+   * All object cpusets and nodesets are properly set now.
    */
-  hwloc_distances_finalize_os(topology);
-  hwloc_group_by_distances(topology);
 
   /* Now connect handy pointers to make remaining discovery easier. */
   hwloc_debug("%s", "\nOk, finished tweaking, now connect\n");
-  if (topology->modified) {
-    hwloc_connect_children(topology->levels[0][0]);
-    if (hwloc_connect_levels(topology) < 0)
-      return -1;
-    topology->modified = 0;
-  }
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    return -1;
   hwloc_debug_print_objects(0, topology->levels[0][0]);
 
   /*
-   * Additional discovery with other backends
+   * Additional discovery
    */
-
-  backend = topology->backends;
-  while (NULL != backend) {
-    int err;
-    if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_CPU
-	|| backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
-      /* already done above */
-      goto next_noncpubackend;
-    if (!backend->discover)
-      goto next_noncpubackend;
-
-    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
-      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
-      hwloc_connect_children(topology->levels[0][0]);
-      if (hwloc_connect_levels(topology) < 0)
-	return -1;
-      topology->modified = 0;
-    }
-
-    err = backend->discover(backend);
-    if (err >= 0) {
-      gotsomeio += err;
-    }
-    hwloc_debug_print_objects(0, topology->levels[0][0]);
-
-next_noncpubackend:
-    backend = backend->next;
+  if (topology->backend_phases & HWLOC_DISC_PHASE_PCI) {
+    dstatus->phase = HWLOC_DISC_PHASE_PCI;
+    hwloc_discover_by_phase(topology, dstatus, "PCI");
   }
-
-  /* if we got anything, filter interesting objects and update the tree */
-  if (gotsomeio) {
-    if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
-      /* drop all I/O children */
-      hwloc_drop_all_io(topology, topology->levels[0][0]);
-    else
-      hwloc_drop_useless_io(topology, topology->levels[0][0]);
-    hwloc_debug("%s", "\nNow reconnecting\n");
-    hwloc_debug_print_objects(0, topology->levels[0][0]);
-    hwloc_propagate_bridge_depth(topology, topology->levels[0][0], 0);
+  if (topology->backend_phases & HWLOC_DISC_PHASE_IO) {
+    dstatus->phase = HWLOC_DISC_PHASE_IO;
+    hwloc_discover_by_phase(topology, dstatus, "IO");
+  }
+  if (topology->backend_phases & HWLOC_DISC_PHASE_MISC) {
+    dstatus->phase = HWLOC_DISC_PHASE_MISC;
+    hwloc_discover_by_phase(topology, dstatus, "MISC");
   }
+  if (topology->backend_phases & HWLOC_DISC_PHASE_ANNOTATE) {
+    dstatus->phase = HWLOC_DISC_PHASE_ANNOTATE;
+    hwloc_discover_by_phase(topology, dstatus, "ANNOTATE");
+  }
+
+  if (getenv("HWLOC_DEBUG_SORT_CHILDREN"))
+    hwloc_debug_sort_children(topology->levels[0][0]);
 
   /* Remove some stuff */
 
-  hwloc_debug("%s", "\nRemoving ignored objects\n");
-  ignore_type_always(topology, &topology->levels[0][0]);
+  hwloc_debug("%s", "\nRemoving bridge objects if needed\n");
+  hwloc_filter_bridges(topology, topology->levels[0][0]);
   hwloc_debug_print_objects(0, topology->levels[0][0]);
 
-  hwloc_debug("%s", "\nRemoving empty objects except numa nodes and PCI devices\n");
+  hwloc_debug("%s", "\nRemoving empty objects\n");
   remove_empty(topology, &topology->levels[0][0]);
-    if (!topology->levels[0][0]) {
+  if (!topology->levels[0][0]) {
     fprintf(stderr, "Topology became empty, aborting!\n");
-    abort();
+    return -1;
+  }
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->cpuset)) {
+    fprintf(stderr, "Topology does not contain any PU, aborting!\n");
+    return -1;
+  }
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->nodeset)) {
+    fprintf(stderr, "Topology does not contain any NUMA node, aborting!\n");
+    return -1;
   }
   hwloc_debug_print_objects(0, topology->levels[0][0]);
 
-  hwloc_debug("%s", "\nRemoving objects whose type has HWLOC_IGNORE_TYPE_KEEP_STRUCTURE and have only one child or are the only child\n");
-  ignore_type_keep_structure(topology, &topology->levels[0][0]);
-  hwloc_debug_print_objects(0, topology->levels[0][0]);
+  /* Reconnect things after all these changes.
+   * Often needed because of Groups inserted for I/Os.
+   * And required for KEEP_STRUCTURE below.
+   */
+  if (hwloc_topology_reconnect(topology, 0) < 0)
+    return -1;
 
-  /* Reconnect things after all these changes */
-  if (topology->modified) {
-    /* Often raised because of Groups inserted for I/Os */
-    hwloc_connect_children(topology->levels[0][0]);
-    if (hwloc_connect_levels(topology) < 0)
-      return -1;
-    topology->modified = 0;
-  }
+  hwloc_debug("%s", "\nRemoving levels with HWLOC_TYPE_FILTER_KEEP_STRUCTURE\n");
+  hwloc_filter_levels_keep_structure(topology);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
 
   /* accumulate children memory in total_memory fields (only once parent is set) */
   hwloc_debug("%s", "\nPropagate total memory up\n");
   propagate_total_memory(topology->levels[0][0]);
 
-  /*
-   * Now that objects are numbered, take distance matrices from backends and put them in the main topology.
-   *
-   * Some objects may have disappeared (in removed_empty or removed_ignored) since we setup os distances
-   * (hwloc_distances_finalize_os()) above. Reset them so as to not point to disappeared objects anymore.
-   */
-  hwloc_distances_restrict_os(topology);
-  hwloc_distances_finalize_os(topology);
-  hwloc_distances_finalize_logical(topology);
+  /* setup the symmetric_subtree attribute */
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+
+  /* apply group depths */
+  hwloc_set_group_depth(topology);
 
   /* add some identification attributes if not loading from XML */
   if (topology->backends
-      && strcmp(topology->backends->component->name, "xml")) {
+      && strcmp(topology->backends->component->name, "xml")
+      && !getenv("HWLOC_DONT_ADD_VERSION_INFO")) {
     char *value;
     /* add a hwlocVersion */
-    hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", VERSION);
+    hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", HWLOC_VERSION);
     /* add a ProcessName */
     value = hwloc_progname(topology);
     if (value) {
@@ -2621,16 +3440,10 @@ hwloc_discover(struct hwloc_topology *topology)
     }
   }
 
-  /*
-   * Now set binding hooks according to topology->is_thissystem
-   * what the native OS backend offers.
-   */
-  hwloc_set_binding_hooks(topology);
-
   return 0;
 }
 
-/* To be before discovery is actually launched,
+/* To be called before discovery is actually launched,
  * Resets everything in case a previous load initialized some stuff.
  */
 void
@@ -2645,40 +3458,66 @@ hwloc_topology_setup_defaults(struct hwloc_topology *topology)
   memset(topology->support.membind, 0, sizeof(*topology->support.membind));
 
   /* Only the System object on top by default */
+  topology->next_gp_index = 1; /* keep 0 as an invalid value */
   topology->nb_levels = 1; /* there's at least SYSTEM */
-  topology->next_group_depth = 0;
-  topology->levels[0] = malloc (sizeof (hwloc_obj_t));
+  topology->levels[0] = hwloc_tma_malloc (topology->tma, sizeof (hwloc_obj_t));
   topology->level_nbobjects[0] = 1;
-  /* NULLify other levels so that we can detect and free old ones in hwloc_connect_levels() if needed */
-  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
-  topology->bridge_level = NULL;
-  topology->pcidev_level = NULL;
-  topology->osdev_level = NULL;
-  topology->first_bridge = topology->last_bridge = NULL;
-  topology->first_pcidev = topology->last_pcidev = NULL;
-  topology->first_osdev = topology->last_osdev = NULL;
-  topology->misc_level = NULL;
-  topology->first_misc = topology->last_misc = NULL;
+
+  /* Machine-wide memory */
+  topology->machine_memory.local_memory = 0;
+  topology->machine_memory.page_types_len = 0;
+  topology->machine_memory.page_types = NULL;
+
+  /* Allowed stuff */
+  topology->allowed_cpuset = NULL;
+  topology->allowed_nodeset = NULL;
+
+  /* NULLify other special levels */
+  memset(&topology->slevels, 0, sizeof(topology->slevels));
+  /* assert the indexes of special levels */
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_NUMANODE == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_NUMANODE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_MISC == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_MISC));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_BRIDGE == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_BRIDGE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_PCIDEV == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_PCI_DEVICE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_OSDEV == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_OS_DEVICE));
+  HWLOC_BUILD_ASSERT(HWLOC_SLEVEL_MEMCACHE == HWLOC_SLEVEL_FROM_DEPTH(HWLOC_TYPE_DEPTH_MEMCACHE));
+
+  /* sane values to type_depth */
+  hwloc_reset_normal_type_depths(topology);
+  topology->type_depth[HWLOC_OBJ_NUMANODE] = HWLOC_TYPE_DEPTH_NUMANODE;
+  topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
+  topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
+  topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+  topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+  topology->type_depth[HWLOC_OBJ_MEMCACHE] = HWLOC_TYPE_DEPTH_MEMCACHE;
 
   /* Create the actual machine object, but don't touch its attributes yet
    * since the OS backend may still change the object into something else
    * (for instance System)
    */
-  root_obj = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, 0);
+  root_obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MACHINE, 0);
   topology->levels[0][0] = root_obj;
 }
 
-int
-hwloc_topology_init (struct hwloc_topology **topologyp)
+static void hwloc__topology_filter_init(struct hwloc_topology *topology);
+
+/* This function may use a tma, it cannot free() or realloc() */
+static int
+hwloc__topology_init (struct hwloc_topology **topologyp,
+		      unsigned nblevels,
+		      struct hwloc_tma *tma)
 {
   struct hwloc_topology *topology;
-  int i;
 
-  topology = malloc (sizeof (struct hwloc_topology));
+  topology = hwloc_tma_malloc (tma, sizeof (struct hwloc_topology));
   if(!topology)
     return -1;
 
-  hwloc_components_init(topology);
+  topology->tma = tma;
+
+  hwloc_components_init(); /* uses malloc without tma, but won't need it since dup() caller already took a reference */
+  hwloc_topology_components_init(topology);
+  hwloc_pci_discovery_init(topology); /* make sure both dup() and load() get sane variables */
 
   /* Setup topology context */
   topology->is_loaded = 0;
@@ -2686,20 +3525,25 @@ hwloc_topology_init (struct hwloc_topology **topologyp)
   topology->is_thissystem = 1;
   topology->pid = 0;
   topology->userdata = NULL;
+  topology->topology_abi = HWLOC_TOPOLOGY_ABI;
+  topology->adopted_shmem_addr = NULL;
+  topology->adopted_shmem_length = 0;
 
-  topology->support.discovery = malloc(sizeof(*topology->support.discovery));
-  topology->support.cpubind = malloc(sizeof(*topology->support.cpubind));
-  topology->support.membind = malloc(sizeof(*topology->support.membind));
+  topology->support.discovery = hwloc_tma_malloc(tma, sizeof(*topology->support.discovery));
+  topology->support.cpubind = hwloc_tma_malloc(tma, sizeof(*topology->support.cpubind));
+  topology->support.membind = hwloc_tma_malloc(tma, sizeof(*topology->support.membind));
 
-  /* Only ignore useless cruft by default */
-  for(i = HWLOC_OBJ_SYSTEM; i < HWLOC_OBJ_TYPE_MAX; i++)
-    topology->ignored_types[i] = HWLOC_IGNORE_TYPE_NEVER;
-  topology->ignored_types[HWLOC_OBJ_GROUP] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+  topology->nb_levels_allocated = nblevels; /* enough for default 10 levels = Mach+Pack+Die+NUMA+L3+L2+L1d+L1i+Co+PU */
+  topology->levels = hwloc_tma_calloc(tma, topology->nb_levels_allocated * sizeof(*topology->levels));
+  topology->level_nbobjects = hwloc_tma_calloc(tma, topology->nb_levels_allocated * sizeof(*topology->level_nbobjects));
 
-  hwloc_distances_init(topology);
+  hwloc__topology_filter_init(topology);
+
+  hwloc_internal_distances_init(topology);
 
   topology->userdata_export_cb = NULL;
   topology->userdata_import_cb = NULL;
+  topology->userdata_not_decoded = 0;
 
   /* Make the topology look like something coherent but empty */
   hwloc_topology_setup_defaults(topology);
@@ -2708,10 +3552,23 @@ hwloc_topology_init (struct hwloc_topology **topologyp)
   return 0;
 }
 
+int
+hwloc_topology_init (struct hwloc_topology **topologyp)
+{
+  return hwloc__topology_init(topologyp,
+			      16, /* 16 is enough for default 10 levels = Mach+Pack+Die+NUMA+L3+L2+L1d+L1i+Co+PU */
+			      NULL); /* no TMA for normal topologies, too many allocations to fix */
+}
+
 int
 hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
                        hwloc_pid_t pid __hwloc_attribute_unused)
 {
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
   /* this does *not* change the backend */
 #ifdef HWLOC_LINUX_SYS
   topology->pid = pid;
@@ -2725,9 +3582,14 @@ hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
 int
 hwloc_topology_set_synthetic(struct hwloc_topology *topology, const char *description)
 {
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
   return hwloc_disc_component_force_enable(topology,
 					   0 /* api */,
-					   -1, "synthetic",
+					   "synthetic",
 					   description, NULL, NULL);
 }
 
@@ -2735,9 +3597,14 @@ int
 hwloc_topology_set_xml(struct hwloc_topology *topology,
 		       const char *xmlpath)
 {
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
   return hwloc_disc_component_force_enable(topology,
 					   0 /* api */,
-					   -1, "xml",
+					   "xml",
 					   xmlpath, NULL, NULL);
 }
 
@@ -2746,9 +3613,14 @@ hwloc_topology_set_xmlbuffer(struct hwloc_topology *topology,
                              const char *xmlbuffer,
                              int size)
 {
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
   return hwloc_disc_component_force_enable(topology,
 					   0 /* api */,
-					   -1, "xml", NULL,
+					   "xml", NULL,
 					   xmlbuffer, (void*) (uintptr_t) size);
 }
 
@@ -2760,6 +3632,12 @@ hwloc_topology_set_flags (struct hwloc_topology *topology, unsigned long flags)
     errno = EBUSY;
     return -1;
   }
+
+  if (flags & ~(HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED|HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM|HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES)) {
+    errno = EINVAL;
+    return -1;
+  }
+
   topology->flags = flags;
   return 0;
 }
@@ -2770,115 +3648,154 @@ hwloc_topology_get_flags (struct hwloc_topology *topology)
   return topology->flags;
 }
 
-int
-hwloc_topology_ignore_type(struct hwloc_topology *topology, hwloc_obj_type_t type)
+static void
+hwloc__topology_filter_init(struct hwloc_topology *topology)
 {
-  if (type >= HWLOC_OBJ_TYPE_MAX) {
-    errno = EINVAL;
-    return -1;
-  }
+  hwloc_obj_type_t type;
+  /* Only ignore useless cruft by default */
+  for(type = HWLOC_OBJ_TYPE_MIN; type < HWLOC_OBJ_TYPE_MAX; type++)
+    topology->type_filter[type] = HWLOC_TYPE_FILTER_KEEP_ALL;
+  topology->type_filter[HWLOC_OBJ_L1ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_L2ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_L3ICACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_MEMCACHE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_GROUP] = HWLOC_TYPE_FILTER_KEEP_STRUCTURE;
+  topology->type_filter[HWLOC_OBJ_MISC] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+  topology->type_filter[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_FILTER_KEEP_NONE;
+}
 
-  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE) {
-    /* we need the PU and NUMA levels */
-    errno = EINVAL;
-    return -1;
-  } else if (hwloc_obj_type_is_io(type)) {
-    /* I/O devices aren't in any level, use topology flags to ignore them */
-    errno = EINVAL;
-    return -1;
+static int
+hwloc__topology_set_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter)
+{
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MACHINE) {
+    if (filter != HWLOC_TYPE_FILTER_KEEP_ALL) {
+      /* we need the Machine, PU and NUMA levels */
+      errno = EINVAL;
+      return -1;
+    }
+  } else if (hwloc__obj_type_is_special(type)) {
+    if (filter == HWLOC_TYPE_FILTER_KEEP_STRUCTURE) {
+      /* I/O and Misc are outside of the main topology structure, makes no sense. */
+      errno = EINVAL;
+      return -1;
+    }
+  } else if (type == HWLOC_OBJ_GROUP) {
+    if (filter == HWLOC_TYPE_FILTER_KEEP_ALL) {
+      /* Groups are always ignored, at least keep_structure */
+      errno = EINVAL;
+      return -1;
+    }
   }
 
-  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_ALWAYS;
+  /* "important" just means "all" for non-I/O non-Misc */
+  if (!hwloc__obj_type_is_special(type) && filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT)
+    filter = HWLOC_TYPE_FILTER_KEEP_ALL;
+
+  topology->type_filter[type] = filter;
   return 0;
 }
 
 int
-hwloc_topology_ignore_type_keep_structure(struct hwloc_topology *topology, hwloc_obj_type_t type)
+hwloc_topology_set_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter)
 {
-  if (type >= HWLOC_OBJ_TYPE_MAX) {
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
     errno = EINVAL;
     return -1;
   }
-
-  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MISC) {
-    /* We need the PU and NUMA levels.
-     * Misc are outside of the main topology structure, makes no sense.
-     */
-    errno = EINVAL;
+  if (topology->is_loaded) {
+    errno = EBUSY;
     return -1;
-  } else if (hwloc_obj_type_is_io(type)) {
-    /* I/O devices aren't in any level, use topology flags to ignore them */
-    errno = EINVAL;
+  }
+  return hwloc__topology_set_type_filter(topology, type, filter);
+}
+
+int
+hwloc_topology_set_all_types_filter(struct hwloc_topology *topology, enum hwloc_type_filter_e filter)
+{
+  hwloc_obj_type_t type;
+  if (topology->is_loaded) {
+    errno = EBUSY;
     return -1;
   }
+  for(type = HWLOC_OBJ_TYPE_MIN; type < HWLOC_OBJ_TYPE_MAX; type++)
+    hwloc__topology_set_type_filter(topology, type, filter);
+  return 0;
+}
 
-  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+int
+hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
+{
+  unsigned i;
+  for(i=HWLOC_OBJ_L1CACHE; i<=HWLOC_OBJ_L3ICACHE; i++) /* inclusive bound so that HWLOC_OBJ_L3ICACHE itself gets the filter too */
+    hwloc_topology_set_type_filter(topology, (hwloc_obj_type_t) i, filter);
   return 0;
 }
 
 int
-hwloc_topology_ignore_all_keep_structure(struct hwloc_topology *topology)
+hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
 {
-  unsigned type;
-  for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++)
-    if (type != HWLOC_OBJ_PU && type != HWLOC_OBJ_NUMANODE
-	&& !hwloc_obj_type_is_io((hwloc_obj_type_t) type))
-      topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+  unsigned i;
+  for(i=HWLOC_OBJ_L1ICACHE; i<=HWLOC_OBJ_L3ICACHE; i++) /* inclusive bound so that HWLOC_OBJ_L3ICACHE itself gets the filter too */
+    hwloc_topology_set_type_filter(topology, (hwloc_obj_type_t) i, filter);
   return 0;
 }
 
-/* traverse the tree and free everything.
- * only use first_child/next_sibling so that it works before load()
- * and may be used when switching between backend.
- */
-static void
-hwloc_topology_clear_tree (struct hwloc_topology *topology, struct hwloc_obj *root)
+int
+hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter)
 {
-  hwloc_obj_t child;
-  child = root->first_child;
-  while (child) {
-    hwloc_obj_t nextchild = child->next_sibling;
-    hwloc_topology_clear_tree (topology, child);
-    child = nextchild;
-  }
-  child = root->io_first_child;
-  while (child) {
-    hwloc_obj_t nextchild = child->next_sibling;
-    hwloc_topology_clear_tree (topology, child);
-    child = nextchild;
-  }
-  child = root->misc_first_child;
-  while (child) {
-    hwloc_obj_t nextchild = child->next_sibling;
-    hwloc_topology_clear_tree (topology, child);
-    child = nextchild;
+  hwloc_topology_set_type_filter(topology, HWLOC_OBJ_BRIDGE, filter);
+  hwloc_topology_set_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, filter);
+  hwloc_topology_set_type_filter(topology, HWLOC_OBJ_OS_DEVICE, filter);
+  return 0;
+}
+
+int
+hwloc_topology_get_type_filter(struct hwloc_topology *topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filterp)
+{
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
   }
-  hwloc_free_unlinked_object (root);
+  *filterp = topology->type_filter[type];
+  return 0;
 }
 
 void
 hwloc_topology_clear (struct hwloc_topology *topology)
 {
+  /* no need to set to NULL after free() since callers will call setup_defaults() or just destroy the rest of the topology */
   unsigned l;
-  hwloc_topology_clear_tree (topology, topology->levels[0][0]);
-  for (l=0; l<topology->nb_levels; l++) {
+  hwloc_internal_distances_destroy(topology);
+  hwloc_free_object_and_children(topology->levels[0][0]);
+  hwloc_bitmap_free(topology->allowed_cpuset);
+  hwloc_bitmap_free(topology->allowed_nodeset);
+  for (l=0; l<topology->nb_levels; l++)
     free(topology->levels[l]);
-    topology->levels[l] = NULL;
-  }
-  free(topology->bridge_level);
-  free(topology->pcidev_level);
-  free(topology->osdev_level);
-  free(topology->misc_level);
+  for(l=0; l<HWLOC_NR_SLEVELS; l++)
+    free(topology->slevels[l].objs);
+  free(topology->machine_memory.page_types);
 }
 
 void
 hwloc_topology_destroy (struct hwloc_topology *topology)
 {
+  if (topology->adopted_shmem_addr) {
+    hwloc__topology_disadopt(topology);
+    return;
+  }
+
   hwloc_backends_disable_all(topology);
-  hwloc_components_destroy_all(topology);
+  hwloc_topology_components_fini(topology);
+  hwloc_components_fini();
 
   hwloc_topology_clear(topology);
-  hwloc_distances_destroy(topology);
+
+  free(topology->levels);
+  free(topology->level_nbobjects);
 
   free(topology->support.discovery);
   free(topology->support.cpubind);
@@ -2889,6 +3806,8 @@ hwloc_topology_destroy (struct hwloc_topology *topology)
 int
 hwloc_topology_load (struct hwloc_topology *topology)
 {
+  struct hwloc_disc_status dstatus;
+  const char *env;
   int err;
 
   if (topology->is_loaded) {
@@ -2896,180 +3815,472 @@ hwloc_topology_load (struct hwloc_topology *topology)
     return -1;
   }
 
-  /* Only apply variables if we have not changed the backend yet.
-   * Only the last one will be kept.
-   * Check for XML last (that's the one that may be set system-wide by administrators)
-   * so that it's only used if other variables are not set,
-   * to allow users to override easily.
-   */
-  if (!topology->backends) {
-    const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
-    if (synthetic_env)
-      hwloc_disc_component_force_enable(topology,
-					1 /* env force */,
-					-1, "synthetic",
-					synthetic_env, NULL, NULL);
-  }
-  if (!topology->backends) {
-    const char *fsroot_path_env = getenv("HWLOC_FSROOT");
-    if (fsroot_path_env)
-      hwloc_disc_component_force_enable(topology,
-					1 /* env force */,
-					HWLOC_DISC_COMPONENT_TYPE_CPU, "linux",
-					fsroot_path_env, NULL, NULL);
-  }
-  if (!topology->backends) {
-    const char *xmlpath_env = getenv("HWLOC_XMLFILE");
-    if (xmlpath_env)
-      hwloc_disc_component_force_enable(topology,
-					1 /* env force */,
-					-1, "xml",
-					xmlpath_env, NULL, NULL);
+  hwloc_internal_distances_prepare(topology);
+
+  if (getenv("HWLOC_XML_USERDATA_NOT_DECODED"))
+    topology->userdata_not_decoded = 1;
+
+  /* Ignore variables if HWLOC_COMPONENTS is set. It will be processed later */
+  if (!getenv("HWLOC_COMPONENTS")) {
+    /* Only apply variables if we have not changed the backend yet.
+     * Only the first one will be kept.
+     * Check for FSROOT first since it's for debugging so likely needs to override everything else.
+     * Check for XML last (that's the one that may be set system-wide by administrators)
+     * so that it's only used if other variables are not set,
+     * to allow users to override easily.
+     */
+    if (!topology->backends) {
+      const char *fsroot_path_env = getenv("HWLOC_FSROOT");
+      if (fsroot_path_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  "linux",
+					  NULL /* backend will getenv again */, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *cpuid_path_env = getenv("HWLOC_CPUID_PATH");
+      if (cpuid_path_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  "x86",
+					  NULL /* backend will getenv again */, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
+      if (synthetic_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  "synthetic",
+					  synthetic_env, NULL, NULL);
+    }
+    if (!topology->backends) {
+      const char *xmlpath_env = getenv("HWLOC_XMLFILE");
+      if (xmlpath_env)
+	hwloc_disc_component_force_enable(topology,
+					  1 /* env force */,
+					  "xml",
+					  xmlpath_env, NULL, NULL);
+    }
   }
 
+  dstatus.excluded_phases = 0;
+  dstatus.flags = 0; /* did nothing yet */
+
+  env = getenv("HWLOC_ALLOW");
+  if (env && !strcmp(env, "all"))
+    /* don't retrieve the sets of allowed resources */
+    dstatus.flags |= HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES;
+
   /* instantiate all possible other backends now */
   hwloc_disc_components_enable_others(topology);
-  /* now that backends are enabled, update the thissystem flag */
+  /* now that backends are enabled, update the thissystem flag and some callbacks */
   hwloc_backends_is_thissystem(topology);
-
-  /* get distance matrix from the environment are store them (as indexes) in the topology.
-   * indexes will be converted into objects later once the tree will be filled
+  hwloc_backends_find_callbacks(topology);
+  /*
+   * Now set binding hooks according to topology->is_thissystem
+   * and what the native OS backend offers.
    */
-  hwloc_distances_set_from_env(topology);
+  hwloc_set_binding_hooks(topology);
+
+  hwloc_pci_discovery_prepare(topology);
 
   /* actual topology discovery */
-  err = hwloc_discover(topology);
+  err = hwloc_discover(topology, &dstatus);
   if (err < 0)
     goto out;
 
+  hwloc_pci_discovery_exit(topology);
+
 #ifndef HWLOC_DEBUG
   if (getenv("HWLOC_DEBUG_CHECK"))
 #endif
     hwloc_topology_check(topology);
 
+  /* Mark distances objs arrays as invalid since we may have removed objects
+   * from the topology after adding the distances (remove_empty, etc).
+   * It would be hard to actually verify whether it's needed.
+   */
+  hwloc_internal_distances_invalidate_cached_objs(topology);
+  /* And refresh distances so that multithreaded concurrent distances_get()
+   * don't refresh() concurrently (disallowed).
+   */
+  hwloc_internal_distances_refresh(topology);
+
   topology->is_loaded = 1;
+
+  if (topology->backend_phases & HWLOC_DISC_PHASE_TWEAK) {
+    dstatus.phase = HWLOC_DISC_PHASE_TWEAK;
+    hwloc_discover_by_phase(topology, &dstatus, "TWEAK");
+  }
+
   return 0;
 
  out:
+  hwloc_pci_discovery_exit(topology);
   hwloc_topology_clear(topology);
-  hwloc_distances_destroy(topology);
   hwloc_topology_setup_defaults(topology);
   hwloc_backends_disable_all(topology);
   return -1;
 }
 
 /* adjust object cpusets according the given droppedcpuset,
- * drop object whose cpuset becomes empty,
- * and mark dropped nodes in droppednodeset
+ * drop objects whose cpuset becomes empty and that have no normal or memory children left,
+ * and propagate NUMA node removal as nodeset changes in parents (droppednodeset may be NULL).
  */
 static void
-restrict_object(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj, hwloc_const_cpuset_t droppedcpuset, hwloc_nodeset_t droppednodeset, int droppingparent)
+restrict_object_by_cpuset(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj,
+			  hwloc_bitmap_t droppedcpuset, hwloc_bitmap_t droppednodeset)
 {
   hwloc_obj_t obj = *pobj, child, *pchild;
-  int dropping;
-  int modified = hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset);
-
-  hwloc_clear_object_distances(obj);
-
-  hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
-  hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
-  hwloc_bitmap_andnot(obj->allowed_cpuset, obj->allowed_cpuset, droppedcpuset);
+  int modified = 0; /* non-zero when the children must be visited as well */
 
-  dropping = droppingparent || hwloc_bitmap_iszero(obj->cpuset);
+  if (hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset)) {
+    hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+    hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+    modified = 1;
+  } else {
+    if ((flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS)
+	&& hwloc_bitmap_iszero(obj->complete_cpuset)) {
+      /* we're empty, there's a NUMAnode below us, it'll be removed this time */
+      modified = 1;
+    }
+    /* nodeset cannot intersect unless cpuset intersects or is empty */
+    if (droppednodeset)
+      assert(!hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset)
+	     || hwloc_bitmap_iszero(obj->complete_cpuset));
+  }
+  if (droppednodeset) { /* NULL means no nodeset bits have to be cleared */
+    hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
+    hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
+  }
 
   if (modified) {
     for_each_child_safe(child, obj, pchild)
-      restrict_object(topology, flags, pchild, droppedcpuset, droppednodeset, dropping);
+      restrict_object_by_cpuset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    /* if some hwloc_bitmap_first(child->complete_cpuset) changed, children might need to be reordered */
+    hwloc__reorder_children(obj);
+
+    for_each_memory_child_safe(child, obj, pchild)
+      restrict_object_by_cpuset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    /* local NUMA nodes have the same cpusets, no need to reorder them */
+
     /* Nothing to restrict under I/O or Misc */
   }
 
-  if (dropping) {
-    hwloc_debug("%s", "\nRemoving object during restrict");
+  if (!obj->first_child && !obj->memory_first_child /* arity not updated before connect_children() */
+      && hwloc_bitmap_iszero(obj->cpuset)
+      && (obj->type != HWLOC_OBJ_NUMANODE || (flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS))) {
+    /* remove object (a NUMA node with an empty cpuset is kept unless REMOVE_CPULESS is set) */
+    hwloc_debug("%s", "\nRemoving object during restrict by cpuset");
     hwloc_debug_print_object(0, obj);
-    if (obj->type == HWLOC_OBJ_NUMANODE)
-      hwloc_bitmap_set(droppednodeset, obj->os_index);
-    if (obj->io_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO))
-      unlink_and_free_object_and_children(&obj->io_first_child);
-    if (obj->misc_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC))
-      unlink_and_free_object_and_children(&obj->misc_first_child);
+
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO)) { /* without ADAPT_IO the I/O children are freed here */
+      hwloc_free_object_siblings_and_children(obj->io_first_child);
+      obj->io_first_child = NULL;
+    }
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC)) { /* without ADAPT_MISC the Misc children are freed here */
+      hwloc_free_object_siblings_and_children(obj->misc_first_child);
+      obj->misc_first_child = NULL;
+    }
+    assert(!obj->first_child);
+    assert(!obj->memory_first_child);
     unlink_and_free_single_object(pobj);
     topology->modified = 1;
-    /* do not remove children. if they were to be removed, they would have been already */
   }
 }
 
-/* adjust object nodesets accordingly the given droppednodeset
+/* adjust object nodesets according to the given droppednodeset,
+ * drop objects whose nodeset becomes empty and that have no normal or memory children left,
+ * and propagate PU removal as cpuset changes in parents (droppedcpuset may be NULL).
  */
 static void
-restrict_object_nodeset(hwloc_topology_t topology, hwloc_obj_t *pobj, hwloc_nodeset_t droppednodeset)
+restrict_object_by_nodeset(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj,
+			   hwloc_bitmap_t droppedcpuset, hwloc_bitmap_t droppednodeset)
 {
   hwloc_obj_t obj = *pobj, child, *pchild;
+  int modified = 0; /* non-zero when the children must be visited as well */
 
-  /* if this object isn't modified, don't bother looking at children */
-  if (!hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset))
-    return;
+  if (hwloc_bitmap_intersects(obj->complete_nodeset, droppednodeset)) {
+    hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
+    hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
+    modified = 1;
+  } else {
+    if ((flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS)
+	&& hwloc_bitmap_iszero(obj->complete_nodeset)) {
+      /* we're empty, there's a PU below us, it'll be removed this time */
+      modified = 1;
+    }
+    /* cpuset cannot intersect unless nodeset intersects or is empty */
+    if (droppedcpuset)
+      assert(!hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset)
+	     || hwloc_bitmap_iszero(obj->complete_nodeset));
+  }
+  if (droppedcpuset) { /* NULL means no cpuset bits have to be cleared */
+    hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+    hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+  }
+
+  if (modified) {
+    for_each_child_safe(child, obj, pchild)
+      restrict_object_by_nodeset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    if (flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS)
+      /* cpuset may have changed above where some NUMA nodes were removed.
+       * if some hwloc_bitmap_first(child->complete_cpuset) changed, children might need to be reordered */
+      hwloc__reorder_children(obj);
 
-  hwloc_bitmap_andnot(obj->nodeset, obj->nodeset, droppednodeset);
-  hwloc_bitmap_andnot(obj->complete_nodeset, obj->complete_nodeset, droppednodeset);
-  hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, droppednodeset);
+    for_each_memory_child_safe(child, obj, pchild)
+      restrict_object_by_nodeset(topology, flags, pchild, droppedcpuset, droppednodeset);
+    /* FIXME: we may have to reorder CPU-less groups of NUMA nodes if some of their nodes were removed */
 
-  for_each_child_safe(child, obj, pchild)
-    restrict_object_nodeset(topology, pchild, droppednodeset);
-  /* Nothing to restrict under I/O and Misc */
+    /* Nothing to restrict under I/O or Misc */
+  }
+
+  if (!obj->first_child && !obj->memory_first_child /* arity not updated before connect_children() */
+      && hwloc_bitmap_iszero(obj->nodeset)
+      && (obj->type != HWLOC_OBJ_PU || (flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS))) {
+    /* remove object (a PU with an empty nodeset is kept unless REMOVE_MEMLESS is set) */
+    hwloc_debug("%s", "\nRemoving object during restrict by nodeset");
+    hwloc_debug_print_object(0, obj);
+
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO)) { /* without ADAPT_IO the I/O children are freed here */
+      hwloc_free_object_siblings_and_children(obj->io_first_child);
+      obj->io_first_child = NULL;
+    }
+    if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC)) { /* without ADAPT_MISC the Misc children are freed here */
+      hwloc_free_object_siblings_and_children(obj->misc_first_child);
+      obj->misc_first_child = NULL;
+    }
+    assert(!obj->first_child);
+    assert(!obj->memory_first_child);
+    unlink_and_free_single_object(pobj);
+    topology->modified = 1;
+  }
 }
 
 int
-hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_cpuset_t cpuset, unsigned long flags)
+hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_bitmap_t set, unsigned long flags)
 {
   hwloc_bitmap_t droppedcpuset, droppednodeset;
 
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (topology->adopted_shmem_addr) {
+    errno = EPERM;
+    return -1;
+  }
+
+  if (flags & ~(HWLOC_RESTRICT_FLAG_REMOVE_CPULESS
+		|HWLOC_RESTRICT_FLAG_ADAPT_MISC|HWLOC_RESTRICT_FLAG_ADAPT_IO
+		|HWLOC_RESTRICT_FLAG_BYNODESET|HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (flags & HWLOC_RESTRICT_FLAG_BYNODESET) {
+    /* cannot use CPULESS with BYNODESET */
+    if (flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS) {
+      errno = EINVAL;
+      return -1;
+    }
+  } else {
+    /* cannot use MEMLESS without BYNODESET */
+    if (flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS) {
+      errno = EINVAL;
+      return -1;
+    }
+  }
+
   /* make sure we'll keep something in the topology */
-  if (!hwloc_bitmap_intersects(cpuset, topology->levels[0][0]->cpuset)) {
+  if (((flags & HWLOC_RESTRICT_FLAG_BYNODESET) && !hwloc_bitmap_intersects(set, topology->allowed_nodeset))
+      || (!(flags & HWLOC_RESTRICT_FLAG_BYNODESET) && !hwloc_bitmap_intersects(set, topology->allowed_cpuset))) {
     errno = EINVAL; /* easy failure, just don't touch the topology */
     return -1;
   }
 
   droppedcpuset = hwloc_bitmap_alloc();
   droppednodeset = hwloc_bitmap_alloc();
+  if (!droppedcpuset || !droppednodeset) { /* NOTE(review): errno is not set on this path; presumably left by the failed allocation - confirm */
+    hwloc_bitmap_free(droppedcpuset);
+    hwloc_bitmap_free(droppednodeset);
+    return -1;
+  }
+
+  if (flags & HWLOC_RESTRICT_FLAG_BYNODESET) {
+    /* nodeset to clear */
+    hwloc_bitmap_not(droppednodeset, set);
+    /* cpuset to clear */
+    if (flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS) {
+      hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
+      do {
+	/* PU will be removed if nodeset gets or was empty (NOTE(review): the first test reads pu->cpuset, not pu->nodeset - confirm) */
+	if (hwloc_bitmap_iszero(pu->cpuset)
+	    || hwloc_bitmap_isincluded(pu->nodeset, droppednodeset))
+	  hwloc_bitmap_set(droppedcpuset, pu->os_index);
+	pu = pu->next_cousin;
+      } while (pu);
+
+      /* check we're not removing all PUs */
+      if (hwloc_bitmap_isincluded(topology->allowed_cpuset, droppedcpuset)) {
+	errno = EINVAL; /* easy failure, just don't touch the topology */
+	hwloc_bitmap_free(droppedcpuset);
+	hwloc_bitmap_free(droppednodeset);
+	return -1;
+      }
+    }
+    /* drop the droppedcpuset bitmap if unused or empty: NULL tells the recursion there is no cpuset to clear */
+    if (!(flags & HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS)
+	|| hwloc_bitmap_iszero(droppedcpuset)) {
+      hwloc_bitmap_free(droppedcpuset);
+      droppedcpuset = NULL;
+    }
+
+    /* now recurse to filter sets and drop things */
+    restrict_object_by_nodeset(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset);
+    hwloc_bitmap_andnot(topology->allowed_nodeset, topology->allowed_nodeset, droppednodeset);
+    if (droppedcpuset)
+      hwloc_bitmap_andnot(topology->allowed_cpuset, topology->allowed_cpuset, droppedcpuset);
+
+  } else {
+    /* cpuset to clear */
+    hwloc_bitmap_not(droppedcpuset, set);
+    /* nodeset to clear */
+    if (flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS) {
+      hwloc_obj_t node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
+      do {
+	/* node will be removed if cpuset gets or was empty */
+	if (hwloc_bitmap_iszero(node->cpuset)
+	    || hwloc_bitmap_isincluded(node->cpuset, droppedcpuset))
+	  hwloc_bitmap_set(droppednodeset, node->os_index);
+	node = node->next_cousin;
+      } while (node);
+
+      /* check we're not removing all NUMA nodes */
+      if (hwloc_bitmap_isincluded(topology->allowed_nodeset, droppednodeset)) {
+	errno = EINVAL; /* easy failure, just don't touch the topology */
+	hwloc_bitmap_free(droppedcpuset);
+	hwloc_bitmap_free(droppednodeset);
+	return -1;
+      }
+    }
+    /* drop the droppednodeset bitmap if unused or empty: NULL tells the recursion there is no nodeset to clear */
+    if (!(flags & HWLOC_RESTRICT_FLAG_REMOVE_CPULESS)
+	|| hwloc_bitmap_iszero(droppednodeset)) {
+      hwloc_bitmap_free(droppednodeset);
+      droppednodeset = NULL;
+    }
 
-  /* drop object based on the reverse of cpuset, and fill the 'dropped' nodeset */
-  hwloc_bitmap_not(droppedcpuset, cpuset);
-  restrict_object(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset, 0 /* root cannot be removed */);
-  /* update nodesets according to dropped nodeset */
-  restrict_object_nodeset(topology, &topology->levels[0][0], droppednodeset);
+    /* now recurse to filter sets and drop things */
+    restrict_object_by_cpuset(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset);
+    hwloc_bitmap_andnot(topology->allowed_cpuset, topology->allowed_cpuset, droppedcpuset);
+    if (droppednodeset)
+      hwloc_bitmap_andnot(topology->allowed_nodeset, topology->allowed_nodeset, droppednodeset);
+  }
 
   hwloc_bitmap_free(droppedcpuset);
   hwloc_bitmap_free(droppednodeset);
 
-  hwloc_connect_children(topology->levels[0][0]);
-  if (hwloc_connect_levels(topology) < 0)
+  if (hwloc_topology_reconnect(topology, 0) < 0)
     goto out;
-  topology->modified = 0;
 
+  /* some objects may have disappeared, we need to update distances objs arrays */
+  hwloc_internal_distances_invalidate_cached_objs(topology);
+
+  hwloc_filter_levels_keep_structure(topology);
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
   propagate_total_memory(topology->levels[0][0]);
-  hwloc_distances_restrict(topology, flags);
-  hwloc_distances_finalize_os(topology);
-  hwloc_distances_finalize_logical(topology);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
   return 0;
 
  out:
   /* unrecoverable failure, re-init the topology */
    hwloc_topology_clear(topology);
-   hwloc_distances_destroy(topology);
    hwloc_topology_setup_defaults(topology);
    return -1;
 }
 
+int
+hwloc_topology_allow(struct hwloc_topology *topology,
+		     hwloc_const_cpuset_t cpuset, hwloc_const_nodeset_t nodeset,
+		     unsigned long flags)
+{
+  if (!topology->is_loaded) /* EINVAL unless the topology is loaded */
+    goto einval;
+
+  if (topology->adopted_shmem_addr) { /* EPERM when adopted_shmem_addr is set */
+    errno = EPERM;
+    goto error;
+  }
+
+  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED)) /* EINVAL unless the INCLUDE_DISALLOWED topology flag is set */
+    goto einval;
+
+  if (flags & ~(HWLOC_ALLOW_FLAG_ALL|HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS|HWLOC_ALLOW_FLAG_CUSTOM))
+    goto einval;
+
+  switch (flags) { /* flags must be exactly one of the three values; 0 or any combination reaches default and fails with EINVAL */
+  case HWLOC_ALLOW_FLAG_ALL: {
+    if (cpuset || nodeset) /* both sets must be NULL in this mode */
+      goto einval;
+    hwloc_bitmap_copy(topology->allowed_cpuset, hwloc_get_root_obj(topology)->complete_cpuset);
+    hwloc_bitmap_copy(topology->allowed_nodeset, hwloc_get_root_obj(topology)->complete_nodeset);
+    break;
+  }
+  case HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS: {
+    if (cpuset || nodeset) /* both sets must be NULL in this mode */
+      goto einval;
+    if (!topology->is_thissystem)
+      goto einval;
+    if (!topology->binding_hooks.get_allowed_resources) { /* ENOSYS when no such hook is installed */
+      errno = ENOSYS;
+      goto error;
+    }
+    topology->binding_hooks.get_allowed_resources(topology); /* NOTE(review): any result of the hook is ignored here - confirm */
+    break;
+  }
+  case HWLOC_ALLOW_FLAG_CUSTOM: {
+    if (cpuset) { /* a NULL cpuset leaves allowed_cpuset untouched */
+      /* keep the intersection with the full topology cpuset, if not empty */
+      if (!hwloc_bitmap_intersects(hwloc_get_root_obj(topology)->cpuset, cpuset))
+	goto einval;
+      hwloc_bitmap_and(topology->allowed_cpuset, hwloc_get_root_obj(topology)->cpuset, cpuset);
+    }
+    if (nodeset) { /* a NULL nodeset leaves allowed_nodeset untouched; NOTE(review): on EINVAL below, allowed_cpuset may already have been updated above */
+      /* keep the intersection with the full topology nodeset, if not empty */
+      if (!hwloc_bitmap_intersects(hwloc_get_root_obj(topology)->nodeset, nodeset))
+	goto einval;
+      hwloc_bitmap_and(topology->allowed_nodeset, hwloc_get_root_obj(topology)->nodeset, nodeset);
+    }
+    break;
+  }
+  default:
+    goto einval;
+  }
+
+  return 0;
+
+ einval:
+  errno = EINVAL;
+ error: /* reached directly when errno was already set by the caller of goto */
+  return -1;
+}
+
 int
 hwloc_topology_is_thissystem(struct hwloc_topology *topology)
 {
   return topology->is_thissystem;
 }
 
-unsigned
+int
 hwloc_topology_get_depth(struct hwloc_topology *topology)
 {
-  return topology->nb_levels;
+  return (int) topology->nb_levels; /* number of levels, explicitly cast to the int return type */
 }
 
 const struct hwloc_topology_support *
@@ -3088,17 +4299,83 @@ void * hwloc_topology_get_userdata(struct hwloc_topology * topology)
   return topology->userdata;
 }
 
+hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_cpuset; /* the root object's own bitmap, not a copy */
+}
+
+hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->cpuset; /* the root object's own bitmap, not a copy */
+}
+
+hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
+{
+  return topology->allowed_cpuset; /* the topology's own bitmap, not a copy */
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_nodeset; /* the root object's own bitmap, not a copy */
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->nodeset; /* the root object's own bitmap, not a copy */
+}
+
+hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
+{
+  return topology->allowed_nodeset; /* the topology's own bitmap, not a copy */
+}
+
+
 /****************
  * Debug Checks *
  ****************/
 
+#ifndef NDEBUG /* assert only enabled if !NDEBUG */
+
+static void
+hwloc__check_child_siblings(hwloc_obj_t parent, hwloc_obj_t *array,
+			    unsigned arity, unsigned i,
+			    hwloc_obj_t child, hwloc_obj_t prev)
+{ /* asserts that child is entry i of a sibling list of length arity under parent, with prev as the entry before it */
+  assert(child->parent == parent);
+
+  assert(child->sibling_rank == i);
+  if (array) /* optional: the memory, I/O and Misc checkers pass NULL */
+    assert(child == array[i]);
+
+  if (prev) /* prev is NULL for the first entry */
+    assert(prev->next_sibling == child);
+  assert(child->prev_sibling == prev);
+
+  if (!i) /* only the first entry may lack a previous sibling */
+    assert(child->prev_sibling == NULL);
+  else
+    assert(child->prev_sibling != NULL);
+
+  if (i == arity-1) /* only the last entry may lack a next sibling */
+    assert(child->next_sibling == NULL);
+  else
+    assert(child->next_sibling != NULL);
+}
+
 static void
-hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj);
+hwloc__check_object(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t obj);
 
 /* check children between a parent object */
 static void
-hwloc__check_children(hwloc_topology_t topology, hwloc_obj_t parent)
+hwloc__check_normal_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
 {
+  hwloc_obj_t child, prev;
   unsigned j;
 
   if (!parent->arity) {
@@ -3114,91 +4391,118 @@ hwloc__check_children(hwloc_topology_t topology, hwloc_obj_t parent)
   assert(parent->last_child);
 
   /* sibling checks */
-  for(j=0; j<parent->arity; j++) {
-    hwloc_obj_t child = parent->children[j];
-    assert(child->parent == parent);
-    assert(child->sibling_rank == j);
-    if (j)
-      assert(child->prev_sibling == parent->children[j-1]);
-    else
-      assert(!child->prev_sibling);
-    if (j == parent->arity-1)
-      assert(!child->next_sibling);
-    else
-      assert(child->next_sibling == parent->children[j+1]);
-    if (!hwloc_obj_type_is_io(child->type))
-      assert(child->depth > parent->depth);
+  for(prev = NULL, child = parent->first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* normal child */
+    assert(hwloc__obj_type_is_normal(child->type));
+    /* check depth */
+    assert(child->depth > parent->depth);
+    /* check siblings */
+    hwloc__check_child_siblings(parent, parent->children, parent->arity, j, child, prev);
     /* recurse */
-    hwloc__check_object(topology, child);
+    hwloc__check_object(topology, gp_indexes, child);
   }
+  /* check arity */
+  assert(j == parent->arity);
+
   assert(parent->first_child == parent->children[0]);
   assert(parent->last_child == parent->children[parent->arity-1]);
 
-  /* we already checked in the caller that objects have either all sets or none */
+  /* no normal children below a PU */
+  if (parent->type == HWLOC_OBJ_PU)
+    assert(!parent->arity);
+}
 
-  {
-    /* check that parent->cpuset == exclusive OR of children
-     * (can be wrong for complete_cpuset since disallowed/offline/unknown PUs can be removed)
-     */
-    hwloc_bitmap_t remaining_parent_cpuset = hwloc_bitmap_dup(parent->cpuset);
-    hwloc_bitmap_t remaining_parent_nodeset = hwloc_bitmap_dup(parent->nodeset);
-    for(j=0; j<parent->arity; j++) {
-      if (!parent->children[j]->cpuset)
-	continue;
-      /* check that child cpuset is included in the reminder of the parent */
-      assert(hwloc_bitmap_isincluded(parent->children[j]->cpuset, remaining_parent_cpuset));
-      hwloc_bitmap_andnot(remaining_parent_cpuset, remaining_parent_cpuset, parent->children[j]->cpuset);
-      /* check that child cpuset is included in the parent (multiple children may have the same nodeset when we're below a NUMA node) */
-      assert(hwloc_bitmap_isincluded(parent->children[j]->nodeset, parent->nodeset));
-      hwloc_bitmap_andnot(remaining_parent_nodeset, remaining_parent_nodeset, parent->children[j]->nodeset);
+static void
+hwloc__check_children_cpusets(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  /* we already checked in the caller that objects have either all sets or none */
+  hwloc_obj_t child;
+  int prev_first, prev_empty;
+
+  if (obj->type == HWLOC_OBJ_PU) {
+    /* PU cpuset is just itself, with no normal children */
+    assert(hwloc_bitmap_weight(obj->cpuset) == 1);
+    assert(hwloc_bitmap_first(obj->cpuset) == (int) obj->os_index);
+    assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
+    assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
+    if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED)) { /* without INCLUDE_DISALLOWED every PU present must be allowed */
+      assert(hwloc_bitmap_isset(topology->allowed_cpuset, (int) obj->os_index));
     }
-
-    if (parent->type == HWLOC_OBJ_PU) {
-      /* if parent is a PU (with Misc children for instance),
-       * its os_index bit may remain in cpuset. */
-      assert(hwloc_bitmap_weight(remaining_parent_cpuset) == 1);
-      assert(hwloc_bitmap_first(remaining_parent_cpuset) == (int)parent->os_index);
-    } else {
-      /* nothing remains */
-      assert(hwloc_bitmap_iszero(remaining_parent_cpuset));
+    assert(!obj->arity);
+  } else if (hwloc__obj_type_is_memory(obj->type)) {
+    /* memory object cpuset is equal to its parent */
+    assert(hwloc_bitmap_isequal(obj->parent->cpuset, obj->cpuset));
+    assert(!obj->arity);
+  } else if (!hwloc__obj_type_is_special(obj->type)) {
+    hwloc_bitmap_t set;
+    /* other obj cpuset is the union of its normal children's cpusets, which must be pairwise disjoint (PUs are handled above) */
+    set = hwloc_bitmap_alloc();
+    for_each_child(child, obj) {
+      assert(!hwloc_bitmap_intersects(set, child->cpuset));
+      hwloc_bitmap_or(set, set, child->cpuset);
     }
-    hwloc_bitmap_free(remaining_parent_cpuset);
+    assert(hwloc_bitmap_isequal(set, obj->cpuset));
+    hwloc_bitmap_free(set);
+  }
 
-    if (parent->type == HWLOC_OBJ_NUMANODE)
-      /* if parent is a NUMA node, its os_index bit may remain.
-       * or it could already have been removed by a child. */
-      hwloc_bitmap_clr(remaining_parent_nodeset, parent->os_index);
-    if (parent->type == HWLOC_OBJ_PU) {
-      /* if parent is a PU (with Misc children for instance),
-       * one bit may remain in nodeset. */
-      assert(hwloc_bitmap_weight(remaining_parent_nodeset) == 1);
+  /* check that memory children have same cpuset */
+  for_each_memory_child(child, obj)
+    assert(hwloc_bitmap_isequal(obj->cpuset, child->cpuset));
+
+  /* check that children complete_cpusets are properly ordered, with empty ones after all non-empty ones
+   * (can be wrong for main cpuset since removed PUs can break the ordering).
+   */
+  prev_first = -1; /* -1 works fine with first comparisons below */
+  prev_empty = 0; /* no empty cpuset in previous children */
+  for_each_child(child, obj) {
+    int first = hwloc_bitmap_first(child->complete_cpuset);
+    if (first >= 0) {
+      assert(!prev_empty); /* no objects with CPU after objects without CPU */
+      assert(prev_first < first);
     } else {
-      /* nothing remains */
-      assert(hwloc_bitmap_iszero(remaining_parent_nodeset));
+      prev_empty = 1;
     }
-    hwloc_bitmap_free(remaining_parent_nodeset);
+    prev_first = first;
   }
+}
 
-  /* check that children complete_cpuset are properly ordered, empty ones may be anywhere
-   * (can be wrong for main cpuset since removed PUs can break the ordering).
-   */
-  {
-    int firstchild;
-    int prev_firstchild = -1; /* -1 works fine with first comparisons below */
-    for(j=0; j<parent->arity; j++) {
-      if (!parent->children[j]->complete_cpuset
-	  || hwloc_bitmap_iszero(parent->children[j]->complete_cpuset))
-	continue;
+static void
+hwloc__check_memory_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
 
-      firstchild = hwloc_bitmap_first(parent->children[j]->complete_cpuset);
-      assert(prev_firstchild < firstchild);
-      prev_firstchild = firstchild;
-    }
+  if (!parent->memory_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->memory_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->memory_first_child);
+
+  for(prev = NULL, child = parent->memory_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    assert(hwloc__obj_type_is_memory(child->type));
+    /* check siblings */
+    hwloc__check_child_siblings(parent, NULL, parent->memory_arity, j, child, prev);
+    /* a memory child may only have Memory and Misc children (no normal or I/O ones), recurse */
+    assert(!child->first_child);
+    assert(!child->io_first_child);
+    hwloc__check_object(topology, gp_indexes, child);
   }
+  /* check arity */
+  assert(j == parent->memory_arity);
+
+  /* no memory children below a NUMA node (memory_arity is non-zero whenever this point is reached) */
+  if (parent->type == HWLOC_OBJ_NUMANODE)
+    assert(!parent->memory_arity);
 }
 
 static void
-hwloc__check_io_children(hwloc_topology_t topology, hwloc_obj_t parent)
+hwloc__check_io_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
 {
   unsigned j;
   hwloc_obj_t child, prev;
@@ -3215,27 +4519,20 @@ hwloc__check_io_children(hwloc_topology_t topology, hwloc_obj_t parent)
       child;
       prev = child, child = child->next_sibling, j++) {
     /* all children must be I/O */
-    assert(hwloc_obj_type_is_io(child->type));
-
+    assert(hwloc__obj_type_is_io(child->type));
     /* check siblings */
-    assert(child->parent == parent);
-    assert(child->sibling_rank == j);
-    if (prev)
-      assert(prev->next_sibling == child);
-    assert(child->prev_sibling == prev);
-    if (j == parent->io_arity-1)
-      assert(child->next_sibling == NULL);
-
+    hwloc__check_child_siblings(parent, NULL, parent->io_arity, j, child, prev);
     /* only I/O and Misc children, recurse */
     assert(!child->first_child);
-    hwloc__check_object(topology, child);
+    assert(!child->memory_first_child);
+    hwloc__check_object(topology, gp_indexes, child);
   }
   /* check arity */
   assert(j == parent->io_arity);
 }
 
 static void
-hwloc__check_misc_children(hwloc_topology_t topology, hwloc_obj_t parent)
+hwloc__check_misc_children(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t parent)
 {
   unsigned j;
   hwloc_obj_t child, prev;
@@ -3253,72 +4550,155 @@ hwloc__check_misc_children(hwloc_topology_t topology, hwloc_obj_t parent)
       prev = child, child = child->next_sibling, j++) {
     /* all children must be Misc */
     assert(child->type == HWLOC_OBJ_MISC);
-
     /* check siblings */
-    assert(child->parent == parent);
-    assert(child->sibling_rank == j);
-    if (prev)
-      assert(prev->next_sibling == child);
-    assert(child->prev_sibling == prev);
-    if (j == parent->misc_arity-1)
-      assert(child->next_sibling == NULL);
-
+    hwloc__check_child_siblings(parent, NULL, parent->misc_arity, j, child, prev);
     /* only Misc children, recurse */
     assert(!child->first_child);
+    assert(!child->memory_first_child);
     assert(!child->io_first_child);
-    hwloc__check_object(topology, child);
+    hwloc__check_object(topology, gp_indexes, child);
   }
   /* check arity */
   assert(j == parent->misc_arity);
 }
 
 static void
-hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj)
+hwloc__check_object(hwloc_topology_t topology, hwloc_bitmap_t gp_indexes, hwloc_obj_t obj)
 {
+  assert(!hwloc_bitmap_isset(gp_indexes, obj->gp_index));
+  hwloc_bitmap_set(gp_indexes, obj->gp_index);
+
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  assert((unsigned) obj->type < HWLOC_OBJ_TYPE_MAX);
+
+  assert(hwloc_filter_check_keep_object(topology, obj));
+
   /* check that sets and depth */
-  if (hwloc_obj_type_is_special(obj->type)) {
+  if (hwloc__obj_type_is_special(obj->type)) {
     assert(!obj->cpuset);
     if (obj->type == HWLOC_OBJ_BRIDGE)
-      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_BRIDGE);
+      assert(obj->depth == HWLOC_TYPE_DEPTH_BRIDGE);
     else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
-      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_PCI_DEVICE);
+      assert(obj->depth == HWLOC_TYPE_DEPTH_PCI_DEVICE);
     else if (obj->type == HWLOC_OBJ_OS_DEVICE)
-      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_OS_DEVICE);
+      assert(obj->depth == HWLOC_TYPE_DEPTH_OS_DEVICE);
     else if (obj->type == HWLOC_OBJ_MISC)
-      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_MISC);
+      assert(obj->depth == HWLOC_TYPE_DEPTH_MISC);
   } else {
     assert(obj->cpuset);
-    assert((int) obj->depth >= 0);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_NUMANODE);
+    else if (obj->type == HWLOC_OBJ_MEMCACHE)
+      assert(obj->depth == HWLOC_TYPE_DEPTH_MEMCACHE);
+    else
+      assert(obj->depth >= 0);
+  }
+
+  /* group depth cannot be -1 anymore in v2.0+ */
+  if (obj->type == HWLOC_OBJ_GROUP) {
+    assert(obj->attr->group.depth != (unsigned) -1);
   }
 
   /* there's other cpusets and nodesets if and only if there's a main cpuset */
   assert(!!obj->cpuset == !!obj->complete_cpuset);
-  assert(!!obj->cpuset == !!obj->allowed_cpuset);
   assert(!!obj->cpuset == !!obj->nodeset);
   assert(!!obj->nodeset == !!obj->complete_nodeset);
-  assert(!!obj->nodeset == !!obj->allowed_nodeset);
 
-  /* check that complete/allowed/inline sets are larger than the main sets */
+  /* check that complete/inline sets are larger than the main sets */
   if (obj->cpuset) {
     assert(hwloc_bitmap_isincluded(obj->cpuset, obj->complete_cpuset));
     assert(hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset));
-    if (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) {
-      assert(hwloc_bitmap_isincluded(obj->allowed_cpuset, obj->cpuset));
-      assert(hwloc_bitmap_isincluded(obj->allowed_nodeset, obj->nodeset));
-    } else {
-      assert(hwloc_bitmap_isequal(obj->allowed_cpuset, obj->cpuset));
-      assert(hwloc_bitmap_isequal(obj->allowed_nodeset, obj->nodeset));
-    }
+  }
+
+  /* check cache type/depth vs type */
+  if (hwloc__obj_type_is_cache(obj->type)) {
+    if (hwloc__obj_type_is_icache(obj->type))
+      assert(obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION);
+    else if (hwloc__obj_type_is_dcache(obj->type))
+      assert(obj->attr->cache.type == HWLOC_OBJ_CACHE_DATA
+	     || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED);
+    else
+      assert(0);
+    assert(hwloc_cache_type_by_depth_type(obj->attr->cache.depth, obj->attr->cache.type) == obj->type);
   }
 
   /* check children */
-  hwloc__check_children(topology, obj);
-  hwloc__check_io_children(topology, obj);
-  hwloc__check_misc_children(topology, obj);
+  hwloc__check_normal_children(topology, gp_indexes, obj);
+  hwloc__check_memory_children(topology, gp_indexes, obj);
+  hwloc__check_io_children(topology, gp_indexes, obj);
+  hwloc__check_misc_children(topology, gp_indexes, obj);
+  hwloc__check_children_cpusets(topology, obj);
+  /* nodesets are checked during another recursion with state below */
+}
+
+static void
+hwloc__check_nodesets(hwloc_topology_t topology, hwloc_obj_t obj, hwloc_bitmap_t parentset)
+{
+  hwloc_obj_t child;
+  int prev_first; /* first index of the previous memory child's complete_nodeset */
+  /* parentset holds the nodeset bits accumulated from ancestors; for non-NUMA objects, the local and children contributions are added to it below */
+  if (obj->type == HWLOC_OBJ_NUMANODE) {
+    /* a NUMA node's nodeset and complete_nodeset contain only its own os_index, and it has no normal or memory children */
+    assert(hwloc_bitmap_weight(obj->nodeset) == 1);
+    assert(hwloc_bitmap_first(obj->nodeset) == (int) obj->os_index);
+    assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+    assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+    if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED)) {
+      assert(hwloc_bitmap_isset(topology->allowed_nodeset, (int) obj->os_index));
+    }
+    assert(!obj->arity);
+    assert(!obj->memory_arity);
+    assert(hwloc_bitmap_isincluded(obj->nodeset, parentset));
+  } else {
+    hwloc_bitmap_t myset;
+    hwloc_bitmap_t childset;
+
+    /* the local nodeset is the union of the memory children nodesets, which must be pairwise disjoint */
+    myset = hwloc_bitmap_alloc();
+    for_each_memory_child(child, obj) {
+      assert(!hwloc_bitmap_intersects(myset, child->nodeset));
+      hwloc_bitmap_or(myset, myset, child->nodeset);
+    }
+    /* the local nodeset cannot intersect with parents' local nodeset */
+    assert(!hwloc_bitmap_intersects(myset, parentset));
+    hwloc_bitmap_or(parentset, parentset, myset);
+    hwloc_bitmap_free(myset);
+    /* parentset now contains parent+local contribution */
+
+    /* for each child, recurse to check/get its contribution */
+    childset = hwloc_bitmap_alloc();
+    for_each_child(child, obj) {
+      hwloc_bitmap_t set = hwloc_bitmap_dup(parentset); /* don't touch parentset, we don't want to propagate the first child contribution to other children */
+      hwloc__check_nodesets(topology, child, set);
+      /* extract this child contribution */
+      hwloc_bitmap_andnot(set, set, parentset);
+      /* save it */
+      assert(!hwloc_bitmap_intersects(childset, set));
+      hwloc_bitmap_or(childset, childset, set);
+      hwloc_bitmap_free(set);
+    }
+    /* combine child contribution into parentset */
+    assert(!hwloc_bitmap_intersects(parentset, childset));
+    hwloc_bitmap_or(parentset, parentset, childset);
+    hwloc_bitmap_free(childset);
+    /* now check that our nodeset is combination of parent, local and children */
+    assert(hwloc_bitmap_isequal(obj->nodeset, parentset));
+  }
+
+  /* check that memory children complete_nodesets are strictly ordered by first index
+   * (can be wrong for main nodeset since removed PUs can break the ordering).
+   * NOTE(review): an empty complete_nodeset (first == -1) would fail the assert below -- confirm that cannot happen. */
+  prev_first = -1; /* -1 works fine with first comparisons below */
+  for_each_memory_child(child, obj) {
+    int first = hwloc_bitmap_first(child->complete_nodeset);
+    assert(prev_first < first);
+    prev_first = first;
+  }
 }
 
 static void
-hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
+hwloc__check_level(struct hwloc_topology *topology, int depth,
+		   hwloc_obj_t first, hwloc_obj_t last)
 {
   unsigned width = hwloc_get_nbobjs_by_depth(topology, depth);
   struct hwloc_obj *prev = NULL;
@@ -3334,16 +4714,12 @@ hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
     assert(obj->logical_index == j);
     /* check that all objects in the level have the same type */
     if (prev) {
-      assert(hwloc_type_cmp(obj, prev) == HWLOC_TYPE_EQUAL);
+      assert(hwloc_type_cmp(obj, prev) == HWLOC_OBJ_EQUAL);
       assert(prev->next_cousin == obj);
     }
     assert(obj->prev_cousin == prev);
 
     /* check that PUs and NUMA nodes have correct cpuset/nodeset */
-    if (obj->type == HWLOC_OBJ_PU) {
-      assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
-      assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
-    }
     if (obj->type == HWLOC_OBJ_NUMANODE) {
       assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
       assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
@@ -3360,7 +4736,7 @@ hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
     assert(!obj->prev_cousin);
     /* check type */
     assert(hwloc_get_depth_type(topology, depth) == obj->type);
-    assert(depth == (unsigned) hwloc_get_type_depth(topology, obj->type)
+    assert(depth == hwloc_get_type_depth(topology, obj->type)
 	   || HWLOC_TYPE_DEPTH_MULTIPLE == hwloc_get_type_depth(topology, obj->type));
     /* check last object of the level */
     obj = hwloc_get_obj_by_depth(topology, depth, width-1);
@@ -3368,6 +4744,14 @@ hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
     assert(!obj->next_cousin);
   }
 
+  if (depth < 0) {
+    assert(first == hwloc_get_obj_by_depth(topology, depth, 0));
+    assert(last == hwloc_get_obj_by_depth(topology, depth, width-1));
+  } else {
+    assert(!first);
+    assert(!last);
+  }
+
   /* check last+1 object of the level */
   obj = hwloc_get_obj_by_depth(topology, depth, width);
   assert(!obj);
@@ -3378,42 +4762,113 @@ void
 hwloc_topology_check(struct hwloc_topology *topology)
 {
   struct hwloc_obj *obj;
+  hwloc_bitmap_t gp_indexes, set;
   hwloc_obj_type_t type;
-  unsigned i, j, depth;
+  unsigned i;
+  int j, depth;
+
+  /* make sure we can use ranges to check types */
+
+  /* hwloc__obj_type_is_{,d,i}cache() want cache types to be ordered like this */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L2CACHE == HWLOC_OBJ_L1CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L3CACHE == HWLOC_OBJ_L2CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L4CACHE == HWLOC_OBJ_L3CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L5CACHE == HWLOC_OBJ_L4CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L1ICACHE == HWLOC_OBJ_L5CACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L2ICACHE == HWLOC_OBJ_L1ICACHE + 1);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_L3ICACHE == HWLOC_OBJ_L2ICACHE + 1);
+
+  /* hwloc__obj_type_is_normal(), hwloc__obj_type_is_memory(), hwloc__obj_type_is_io(), hwloc__obj_type_is_special()
+   * and hwloc_reset_normal_type_depths()
+   * want special types to be ordered like this, after all normal types.
+   */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_NUMANODE   + 1 == HWLOC_OBJ_BRIDGE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE     + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_OS_DEVICE  + 1 == HWLOC_OBJ_MISC);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC       + 1 == HWLOC_OBJ_MEMCACHE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MEMCACHE   + 1 == HWLOC_OBJ_DIE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_DIE        + 1 == HWLOC_OBJ_TYPE_MAX);
+
+  /* make sure order and priority arrays have the right size */
+  HWLOC_BUILD_ASSERT(sizeof(obj_type_order)/sizeof(*obj_type_order) == HWLOC_OBJ_TYPE_MAX);
+  HWLOC_BUILD_ASSERT(sizeof(obj_order_type)/sizeof(*obj_order_type) == HWLOC_OBJ_TYPE_MAX);
+  HWLOC_BUILD_ASSERT(sizeof(obj_type_priority)/sizeof(*obj_type_priority) == HWLOC_OBJ_TYPE_MAX);
+
+  /* make sure the Group type filter is not KEEP_ALL (hwloc does not allow KEEP_ALL for groups) */
+  assert(topology->type_filter[HWLOC_OBJ_GROUP] != HWLOC_TYPE_FILTER_KEEP_ALL);
+
+  /* make sure order arrays are coherent */
+  for(type=HWLOC_OBJ_TYPE_MIN; type<HWLOC_OBJ_TYPE_MAX; type++)
+    assert(obj_order_type[obj_type_order[type]] == type);
+  for(i=HWLOC_OBJ_TYPE_MIN; i<HWLOC_OBJ_TYPE_MAX; i++)
+    assert(obj_type_order[obj_order_type[i]] == i);
 
   depth = hwloc_topology_get_depth(topology);
 
   assert(!topology->modified);
 
-  /* check type orders */
-  for (type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
-    assert(hwloc_get_order_type(hwloc_get_type_order(type)) == type);
-  }
-  for (i = hwloc_get_type_order(HWLOC_OBJ_SYSTEM);
-       i <= hwloc_get_type_order(HWLOC_OBJ_CORE); i++) {
-    assert(i == hwloc_get_type_order(hwloc_get_order_type(i)));
-  }
+  /* check that first level is Machine.
+   * Root object cannot be ignored. And Machine can only be merged into PU,
+   * but there must be a NUMA node below Machine, and it cannot be below PU.
+   */
+  assert(hwloc_get_depth_type(topology, 0) == HWLOC_OBJ_MACHINE);
 
-  /* check that last level is PU */
+  /* check that last level is PU and that it doesn't have memory */
   assert(hwloc_get_depth_type(topology, depth-1) == HWLOC_OBJ_PU);
   assert(hwloc_get_nbobjs_by_depth(topology, depth-1) > 0);
-  for(j=0; j<hwloc_get_nbobjs_by_depth(topology, depth-1); j++) {
-    obj = hwloc_get_obj_by_depth(topology, depth-1, j);
+  for(i=0; i<hwloc_get_nbobjs_by_depth(topology, depth-1); i++) {
+    obj = hwloc_get_obj_by_depth(topology, depth-1, i);
     assert(obj);
     assert(obj->type == HWLOC_OBJ_PU);
+    assert(!obj->memory_first_child);
+  }
+  /* check that other levels are not PU or Machine */
+  for(j=1; j<depth-1; j++) {
+    assert(hwloc_get_depth_type(topology, j) != HWLOC_OBJ_PU);
+    assert(hwloc_get_depth_type(topology, j) != HWLOC_OBJ_MACHINE);
+  }
+
+  /* check normal levels */
+  for(j=0; j<depth; j++) {
+    int d;
+    type = hwloc_get_depth_type(topology, j);
+    assert(type != HWLOC_OBJ_NUMANODE);
+    assert(type != HWLOC_OBJ_MEMCACHE);
+    assert(type != HWLOC_OBJ_PCI_DEVICE);
+    assert(type != HWLOC_OBJ_BRIDGE);
+    assert(type != HWLOC_OBJ_OS_DEVICE);
+    assert(type != HWLOC_OBJ_MISC);
+    d = hwloc_get_type_depth(topology, type);
+    assert(d == j || d == HWLOC_TYPE_DEPTH_MULTIPLE);
   }
-  /* check that other levels are not PU */
-  for(i=1; i<depth-1; i++)
-    assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_PU);
 
-  /* check that we have a NUMA level */
-  j = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
-  assert(j < hwloc_topology_get_depth(topology));
-  assert(hwloc_get_depth_type(topology, j) == HWLOC_OBJ_NUMANODE);
-  /* check that other levels are not NUMA */
-  for(i=0; i<depth-1; i++)
-    if (i != j)
-      assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_NUMANODE);
+  /* check type depths, even if there's no such level */
+  for(type=HWLOC_OBJ_TYPE_MIN; type<HWLOC_OBJ_TYPE_MAX; type++) {
+    int d;
+    d = hwloc_get_type_depth(topology, type);
+    if (type == HWLOC_OBJ_NUMANODE) {
+      assert(d == HWLOC_TYPE_DEPTH_NUMANODE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_NUMANODE);
+    } else if (type == HWLOC_OBJ_MEMCACHE) {
+      assert(d == HWLOC_TYPE_DEPTH_MEMCACHE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_MEMCACHE);
+    } else if (type == HWLOC_OBJ_BRIDGE) {
+      assert(d == HWLOC_TYPE_DEPTH_BRIDGE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_BRIDGE);
+    } else if (type == HWLOC_OBJ_PCI_DEVICE) {
+      assert(d == HWLOC_TYPE_DEPTH_PCI_DEVICE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_PCI_DEVICE);
+    } else if (type == HWLOC_OBJ_OS_DEVICE) {
+      assert(d == HWLOC_TYPE_DEPTH_OS_DEVICE);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_OS_DEVICE);
+    } else if (type == HWLOC_OBJ_MISC) {
+      assert(d == HWLOC_TYPE_DEPTH_MISC);
+      assert(hwloc_get_depth_type(topology, d) == HWLOC_OBJ_MISC);
+    } else {
+      assert(d >=0 || d == HWLOC_TYPE_DEPTH_UNKNOWN || d == HWLOC_TYPE_DEPTH_MULTIPLE);
+    }
+  }
 
   /* top-level specific checks */
   assert(hwloc_get_nbobjs_by_depth(topology, 0) == 1);
@@ -3423,14 +4878,37 @@ hwloc_topology_check(struct hwloc_topology *topology)
   assert(obj->cpuset);
   assert(!obj->depth);
 
+  /* check that allowed sets are included in (or equal to) the top-level object's main sets */
+  if (topology->flags & HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED) {
+    assert(hwloc_bitmap_isincluded(topology->allowed_cpuset, obj->cpuset));
+    assert(hwloc_bitmap_isincluded(topology->allowed_nodeset, obj->nodeset));
+  } else {
+    assert(hwloc_bitmap_isequal(topology->allowed_cpuset, obj->cpuset));
+    assert(hwloc_bitmap_isequal(topology->allowed_nodeset, obj->nodeset));
+  }
+
   /* check each level */
-  for(i=0; i<depth; i++)
-    hwloc__check_level(topology, i);
-  hwloc__check_level(topology, HWLOC_OBJ_BRIDGE);
-  hwloc__check_level(topology, HWLOC_OBJ_PCI_DEVICE);
-  hwloc__check_level(topology, HWLOC_OBJ_OS_DEVICE);
-  hwloc__check_level(topology, HWLOC_OBJ_MISC);
+  for(j=0; j<depth; j++)
+    hwloc__check_level(topology, j, NULL, NULL);
+  for(j=0; j<HWLOC_NR_SLEVELS; j++)
+    hwloc__check_level(topology, HWLOC_SLEVEL_TO_DEPTH(j), topology->slevels[j].first, topology->slevels[j].last);
 
   /* recurse and check the tree of children, and type-specific checks */
-  hwloc__check_object(topology, obj);
+  gp_indexes = hwloc_bitmap_alloc(); /* TODO prealloc to topology->next_gp_index */
+  hwloc__check_object(topology, gp_indexes, obj);
+  hwloc_bitmap_free(gp_indexes);
+
+  /* recurse and check the nodesets of children */
+  set = hwloc_bitmap_alloc();
+  hwloc__check_nodesets(topology, obj, set);
+  hwloc_bitmap_free(set);
+}
+
+#else /* NDEBUG */
+
+void
+hwloc_topology_check(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
 }
+
+#endif /* NDEBUG */
diff --git a/ext/hwloc/hwloc/traversal.c b/ext/hwloc/hwloc/traversal.c
index f1e9ba786..0b744d787 100644
--- a/ext/hwloc/hwloc/traversal.c
+++ b/ext/hwloc/hwloc/traversal.c
@@ -1,16 +1,17 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2010 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <private/private.h>
-#include <private/misc.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "private/private.h"
+#include "private/misc.h"
+#include "private/debug.h"
+
 #ifdef HAVE_STRINGS_H
 #include <strings.h>
 #endif /* HAVE_STRINGS_H */
@@ -18,14 +19,20 @@
 int
 hwloc_get_type_depth (struct hwloc_topology *topology, hwloc_obj_type_t type)
 {
-  return topology->type_depth[type];
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_TYPE_MIN == 0);
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX)
+    return HWLOC_TYPE_DEPTH_UNKNOWN;
+  else
+    return topology->type_depth[type];
 }
 
 hwloc_obj_type_t
-hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth)
+hwloc_get_depth_type (hwloc_topology_t topology, int depth)
 {
-  if (depth >= topology->nb_levels)
+  if ((unsigned)depth >= topology->nb_levels)
     switch (depth) {
+    case HWLOC_TYPE_DEPTH_NUMANODE:
+      return HWLOC_OBJ_NUMANODE;
     case HWLOC_TYPE_DEPTH_BRIDGE:
       return HWLOC_OBJ_BRIDGE;
     case HWLOC_TYPE_DEPTH_PCI_DEVICE:
@@ -34,56 +41,107 @@ hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth)
       return HWLOC_OBJ_OS_DEVICE;
     case HWLOC_TYPE_DEPTH_MISC:
       return HWLOC_OBJ_MISC;
+    case HWLOC_TYPE_DEPTH_MEMCACHE:
+      return HWLOC_OBJ_MEMCACHE;
     default:
-      return (hwloc_obj_type_t) -1;
+      return HWLOC_OBJ_TYPE_NONE;
     }
   return topology->levels[depth][0]->type;
 }
 
+int
+hwloc_get_memory_parents_depth (hwloc_topology_t topology)
+{
+  int depth = HWLOC_TYPE_DEPTH_UNKNOWN; /* stays UNKNOWN until the first NUMA node has been examined */
+  /* Return the depth of the non-memory parents of NUMA nodes, or MULTIPLE if they differ. Memory leaves are always NUMA nodes for now, no need to check parents of other memory types */
+  hwloc_obj_t numa = hwloc_get_obj_by_depth(topology, HWLOC_TYPE_DEPTH_NUMANODE, 0);
+  assert(numa);
+  while (numa) {
+    hwloc_obj_t parent = numa->parent;
+    /* walk up the memory hierarchy until the first non-memory ancestor */
+    while (hwloc__obj_type_is_memory(parent->type))
+      parent = parent->parent;
+
+    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+      depth = parent->depth;
+    else if (depth != parent->depth)
+      return HWLOC_TYPE_DEPTH_MULTIPLE; /* NUMA nodes are attached below parents of different depths */
+
+    numa = numa->next_cousin;
+  }
+
+  assert(depth >= 0);
+  return depth;
+}
+
 unsigned
-hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, unsigned depth)
+hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, int depth)
 {
-  if (depth >= topology->nb_levels)
-    switch (depth) {
-    case HWLOC_TYPE_DEPTH_BRIDGE:
-      return topology->bridge_nbobjects;
-    case HWLOC_TYPE_DEPTH_PCI_DEVICE:
-      return topology->pcidev_nbobjects;
-    case HWLOC_TYPE_DEPTH_OS_DEVICE:
-      return topology->osdev_nbobjects;
-    case HWLOC_TYPE_DEPTH_MISC:
-      return topology->misc_nbobjects;
-    default:
+  if ((unsigned)depth >= topology->nb_levels) {
+    unsigned l = HWLOC_SLEVEL_FROM_DEPTH(depth);
+    if (l < HWLOC_NR_SLEVELS)
+      return topology->slevels[l].nbobjs;
+    else
       return 0;
-    }
+  }
   return topology->level_nbobjects[depth];
 }
 
 struct hwloc_obj *
-hwloc_get_obj_by_depth (struct hwloc_topology *topology, unsigned depth, unsigned idx)
+hwloc_get_obj_by_depth (struct hwloc_topology *topology, int depth, unsigned idx)
 {
-  if (depth >= topology->nb_levels)
-    switch (depth) {
-    case HWLOC_TYPE_DEPTH_BRIDGE:
-      return idx < topology->bridge_nbobjects ? topology->bridge_level[idx] : NULL;
-    case HWLOC_TYPE_DEPTH_PCI_DEVICE:
-      return idx < topology->pcidev_nbobjects ? topology->pcidev_level[idx] : NULL;
-    case HWLOC_TYPE_DEPTH_OS_DEVICE:
-      return idx < topology->osdev_nbobjects ? topology->osdev_level[idx] : NULL;
-    case HWLOC_TYPE_DEPTH_MISC:
-      return idx < topology->misc_nbobjects ? topology->misc_level[idx] : NULL;
-    default:
+  if ((unsigned)depth >= topology->nb_levels) {
+    unsigned l = HWLOC_SLEVEL_FROM_DEPTH(depth);
+    if (l < HWLOC_NR_SLEVELS)
+      return idx < topology->slevels[l].nbobjs ? topology->slevels[l].objs[idx] : NULL;
+    else
       return NULL;
-    }
+  }
   if (idx >= topology->level_nbobjects[depth])
     return NULL;
   return topology->levels[depth][idx];
 }
 
+int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_normal(type);
+}
+
+int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_memory(type);
+}
+
+int
+hwloc_obj_type_is_io(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_io(type);
+}
+
+int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_cache(type);
+}
+
+int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_dcache(type);
+}
+
+int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type)
+{
+  return hwloc__obj_type_is_icache(type);
+}
+
 unsigned hwloc_get_closest_objs (struct hwloc_topology *topology, struct hwloc_obj *src, struct hwloc_obj **objs, unsigned max)
 {
   struct hwloc_obj *parent, *nextparent, **src_objs;
-  int i,src_nbobjects;
+  unsigned i,src_nbobjects;
   unsigned stored = 0;
 
   if (!src->cpuset)
@@ -138,16 +196,15 @@ hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bi
   }
 
   for (i=0; i<current->arity; i++) {
-    hwloc_bitmap_t subset = hwloc_bitmap_dup(set);
+    hwloc_bitmap_t subset;
     int ret;
 
     /* split out the cpuset part corresponding to this child and see if there's anything to do */
-    hwloc_bitmap_and(subset, subset, current->children[i]->cpuset);
-    if (hwloc_bitmap_iszero(subset)) {
-      hwloc_bitmap_free(subset);
+    if (!hwloc_bitmap_intersects(set,current->children[i]->cpuset))
       continue;
-    }
 
+    subset = hwloc_bitmap_dup(set);
+    hwloc_bitmap_and(subset, subset, current->children[i]->cpuset);
     ret = hwloc__get_largest_objs_inside_cpuset (current->children[i], subset, res, max);
     gotten += ret;
     hwloc_bitmap_free(subset);
@@ -180,13 +237,21 @@ hwloc_obj_type_string (hwloc_obj_type_t obj)
 {
   switch (obj)
     {
-    case HWLOC_OBJ_SYSTEM: return "System";
     case HWLOC_OBJ_MACHINE: return "Machine";
     case HWLOC_OBJ_MISC: return "Misc";
     case HWLOC_OBJ_GROUP: return "Group";
+    case HWLOC_OBJ_MEMCACHE: return "MemCache";
     case HWLOC_OBJ_NUMANODE: return "NUMANode";
     case HWLOC_OBJ_PACKAGE: return "Package";
-    case HWLOC_OBJ_CACHE: return "Cache";
+    case HWLOC_OBJ_DIE: return "Die";
+    case HWLOC_OBJ_L1CACHE: return "L1Cache";
+    case HWLOC_OBJ_L2CACHE: return "L2Cache";
+    case HWLOC_OBJ_L3CACHE: return "L3Cache";
+    case HWLOC_OBJ_L4CACHE: return "L4Cache";
+    case HWLOC_OBJ_L5CACHE: return "L5Cache";
+    case HWLOC_OBJ_L1ICACHE: return "L1iCache";
+    case HWLOC_OBJ_L2ICACHE: return "L2iCache";
+    case HWLOC_OBJ_L3ICACHE: return "L3iCache";
     case HWLOC_OBJ_CORE: return "Core";
     case HWLOC_OBJ_BRIDGE: return "Bridge";
     case HWLOC_OBJ_PCI_DEVICE: return "PCIDev";
@@ -196,256 +261,202 @@ hwloc_obj_type_string (hwloc_obj_type_t obj)
     }
 }
 
-hwloc_obj_type_t
-hwloc_obj_type_of_string (const char * string)
+/* Check if string matches the given type at least on minmatch chars.
+ * On success, return the address of where matching stops, either pointing to \0 or to a suffix (digits, colon, etc)
+ * On error, return NULL;
+ */
+static __hwloc_inline const char *
+hwloc__type_match(const char *string,
+		  const char *type, /* type must be lowercase */
+		  size_t minmatch)
 {
-  if (!strcasecmp(string, "System")) return HWLOC_OBJ_SYSTEM;
-  if (!strcasecmp(string, "Machine")) return HWLOC_OBJ_MACHINE;
-  if (!strcasecmp(string, "Misc")) return HWLOC_OBJ_MISC;
-  if (!strcasecmp(string, "Group")) return HWLOC_OBJ_GROUP;
-  if (!strcasecmp(string, "NUMANode") || !strcasecmp(string, "Node")) return HWLOC_OBJ_NUMANODE;
-  if (!strcasecmp(string, "Package") || !strcasecmp(string, "Socket") /* backward compat with v1.10 */) return HWLOC_OBJ_PACKAGE;
-  if (!strcasecmp(string, "Cache")) return HWLOC_OBJ_CACHE;
-  if (!strcasecmp(string, "Core")) return HWLOC_OBJ_CORE;
-  if (!strcasecmp(string, "PU")) return HWLOC_OBJ_PU;
-  if (!strcasecmp(string, "Bridge")) return HWLOC_OBJ_BRIDGE;
-  if (!strcasecmp(string, "PCIDev")) return HWLOC_OBJ_PCI_DEVICE;
-  if (!strcasecmp(string, "OSDev")) return HWLOC_OBJ_OS_DEVICE;
-  return (hwloc_obj_type_t) -1;
+  const char *s, *t;
+  unsigned i;
+  for(i=0, s=string, t=type; ; i++, s++, t++) {
+    if (!*s) {
+      /* string ends before type */
+      if (i<minmatch)
+	return NULL;
+      else
+	return s;
+    }
+    if (*s != *t && *s != *t + 'A' - 'a') {
+      /* string is different */
+      if ((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z') || *s == '-')
+	/* valid character that doesn't match */
+	return NULL;
+      /* invalid character, we reached the end of the type name in string, stop matching here */
+      if (i<minmatch)
+	return NULL;
+      else
+	return s;
+    }
+  }
+
+  return NULL;
 }
 
 int
-hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+hwloc_type_sscanf(const char *string, hwloc_obj_type_t *typep,
+		  union hwloc_obj_attr_u *attrp, size_t attrsize)
 {
   hwloc_obj_type_t type = (hwloc_obj_type_t) -1;
-  int depthattr = -1;
+  unsigned depthattr = (unsigned) -1;
   hwloc_obj_cache_type_t cachetypeattr = (hwloc_obj_cache_type_t) -1; /* unspecified */
+  hwloc_obj_bridge_type_t ubtype = (hwloc_obj_bridge_type_t) -1;
+  hwloc_obj_osdev_type_t ostype = (hwloc_obj_osdev_type_t) -1;
   char *end;
 
-  /* types without depthattr */
-  if (!hwloc_strncasecmp(string, "system", 2)) {
-    type = HWLOC_OBJ_SYSTEM;
-  } else if (!hwloc_strncasecmp(string, "machine", 2)) {
+  /* Never match the ending \0 since we want to match things like core:2 too.
+   * We'll compare only the beginning substring made of letters and dashes.
+   */
+
+  /* types without a custom depth */
+
+  /* osdev subtype first to avoid conflicts coproc/core etc */
+  if (hwloc__type_match(string, "osdev", 2)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+  } else if (hwloc__type_match(string, "block", 4)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_BLOCK;
+  } else if (hwloc__type_match(string, "network", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_NETWORK;
+  } else if (hwloc__type_match(string, "openfabrics", 7)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_OPENFABRICS;
+  } else if (hwloc__type_match(string, "dma", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_DMA;
+  } else if (hwloc__type_match(string, "gpu", 3)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_GPU;
+  } else if (hwloc__type_match(string, "coproc", 5)
+	     || hwloc__type_match(string, "co-processor", 6)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+    ostype = HWLOC_OBJ_OSDEV_COPROC;
+
+  } else if (hwloc__type_match(string, "machine", 2)) {
     type = HWLOC_OBJ_MACHINE;
-  } else if (!hwloc_strncasecmp(string, "node", 1)
-	     || !hwloc_strncasecmp(string, "numa", 1)) { /* matches node and numanode */
+  } else if (hwloc__type_match(string, "numanode", 2)
+	     || hwloc__type_match(string, "node", 2)) { /* for convenience */
     type = HWLOC_OBJ_NUMANODE;
-  } else if (!hwloc_strncasecmp(string, "package", 2)
-	     || !hwloc_strncasecmp(string, "socket", 2)) { /* backward compat with v1.10 */
+  } else if (hwloc__type_match(string, "memcache", 5)
+	     || hwloc__type_match(string, "memory-side cache", 8)) {
+    type = HWLOC_OBJ_MEMCACHE;
+  } else if (hwloc__type_match(string, "package", 2)
+	     || hwloc__type_match(string, "socket", 2)) { /* backward compat with v1.10 */
     type = HWLOC_OBJ_PACKAGE;
-  } else if (!hwloc_strncasecmp(string, "core", 2)) {
+  } else if (hwloc__type_match(string, "die", 2)) {
+    type = HWLOC_OBJ_DIE;
+  } else if (hwloc__type_match(string, "core", 2)) {
     type = HWLOC_OBJ_CORE;
-  } else if (!hwloc_strncasecmp(string, "pu", 2)) {
+  } else if (hwloc__type_match(string, "pu", 2)) {
     type = HWLOC_OBJ_PU;
-  } else if (!hwloc_strncasecmp(string, "misc", 2)) {
+  } else if (hwloc__type_match(string, "misc", 4)) {
     type = HWLOC_OBJ_MISC;
-  } else if (!hwloc_strncasecmp(string, "bridge", 2)) {
+
+  } else if (hwloc__type_match(string, "bridge", 4)) {
     type = HWLOC_OBJ_BRIDGE;
-  } else if (!hwloc_strncasecmp(string, "pci", 2)) {
+  } else if (hwloc__type_match(string, "hostbridge", 6)) {
+    type = HWLOC_OBJ_BRIDGE;
+    ubtype = HWLOC_OBJ_BRIDGE_HOST;
+  } else if (hwloc__type_match(string, "pcibridge", 5)) {
+    type = HWLOC_OBJ_BRIDGE;
+    ubtype = HWLOC_OBJ_BRIDGE_PCI;
+
+  } else if (hwloc__type_match(string, "pcidev", 3)) {
     type = HWLOC_OBJ_PCI_DEVICE;
-  } else if (!hwloc_strncasecmp(string, "os", 2)) {
-    type = HWLOC_OBJ_OS_DEVICE;
 
   /* types with depthattr */
-  } else if (!hwloc_strncasecmp(string, "cache", 2)) {
-    type = HWLOC_OBJ_CACHE;
-
   } else if ((string[0] == 'l' || string[0] == 'L') && string[1] >= '0' && string[1] <= '9') {
-    type = HWLOC_OBJ_CACHE;
+    char *suffix;
     depthattr = strtol(string+1, &end, 10);
-    if (*end == 'd') {
-      cachetypeattr = HWLOC_OBJ_CACHE_DATA;
-    } else if (*end == 'i') {
-      cachetypeattr = HWLOC_OBJ_CACHE_INSTRUCTION;
-    } else if (*end == 'u') {
-      cachetypeattr = HWLOC_OBJ_CACHE_UNIFIED;
+    if (*end == 'i' || *end == 'I') {
+      if (depthattr >= 1 && depthattr <= 3) {
+	type = HWLOC_OBJ_L1ICACHE + depthattr-1;
+	cachetypeattr = HWLOC_OBJ_CACHE_INSTRUCTION;
+	suffix = end+1;
+      } else
+	return -1;
+    } else {
+      if (depthattr >= 1 && depthattr <= 5) {
+	type = HWLOC_OBJ_L1CACHE + depthattr-1;
+	if (*end == 'd' || *end == 'D') {
+	  cachetypeattr = HWLOC_OBJ_CACHE_DATA;
+	  suffix = end+1;
+	} else if (*end == 'u' || *end == 'U') {
+	  cachetypeattr = HWLOC_OBJ_CACHE_UNIFIED;
+	  suffix = end+1;
+	} else {
+	  cachetypeattr = HWLOC_OBJ_CACHE_UNIFIED;
+	  suffix = end;
+	}
+      } else
+	return -1;
     }
+    /* check whether the optional suffix matches "cache" */
+    if (!hwloc__type_match(suffix, "cache", 0))
+      return -1;
 
-  } else if (!hwloc_strncasecmp(string, "group", 2)) {
-    int length;
+  } else if ((end = (char *) hwloc__type_match(string, "group", 2)) != NULL) {
     type = HWLOC_OBJ_GROUP;
-    length = strcspn(string, "0123456789");
-    if (length <= 5 && !hwloc_strncasecmp(string, "group", length)
-	&& string[length] >= '0' && string[length] <= '9') {
-      depthattr = strtol(string+length, &end, 10);
+    if (*end >= '0' && *end <= '9') {
+      depthattr = strtol(end, &end, 10);
     }
+
   } else
     return -1;
 
   *typep = type;
-  if (depthattrp)
-    *depthattrp = depthattr;
-  if (typeattrp) {
-    if (type == HWLOC_OBJ_CACHE && sizeof(hwloc_obj_cache_type_t) <= typeattrsize)
-      memcpy(typeattrp, &cachetypeattr, sizeof(hwloc_obj_cache_type_t));
+  if (attrp) {
+    if (hwloc__obj_type_is_cache(type) && attrsize >= sizeof(attrp->cache)) {
+      attrp->cache.depth = depthattr;
+      attrp->cache.type = cachetypeattr;
+    } else if (type == HWLOC_OBJ_GROUP && attrsize >= sizeof(attrp->group)) {
+      attrp->group.depth = depthattr;
+    } else if (type == HWLOC_OBJ_BRIDGE && attrsize >= sizeof(attrp->bridge)) {
+      attrp->bridge.upstream_type = ubtype;
+      attrp->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI; /* nothing else so far */
+    } else if (type == HWLOC_OBJ_OS_DEVICE && attrsize >= sizeof(attrp->osdev)) {
+      attrp->osdev.type = ostype;
+    }
   }
-
   return 0;
 }
 
-static const char *
-hwloc_pci_class_string(unsigned short class_id)
+int
+hwloc_type_sscanf_as_depth(const char *string, hwloc_obj_type_t *typep,
+			   hwloc_topology_t topology, int *depthp)
 {
-  switch ((class_id & 0xff00) >> 8) {
-    case 0x00:
-      switch (class_id) {
-	case 0x0001: return "VGA";
-      }
-      return "PCI";
-    case 0x01:
-      switch (class_id) {
-	case 0x0100: return "SCSI";
-	case 0x0101: return "IDE";
-	case 0x0102: return "Flop";
-	case 0x0103: return "IPI";
-	case 0x0104: return "RAID";
-	case 0x0105: return "ATA";
-	case 0x0106: return "SATA";
-	case 0x0107: return "SAS";
-	case 0x0108: return "NVMExp";
-      }
-      return "Stor";
-    case 0x02:
-      switch (class_id) {
-	case 0x0200: return "Ether";
-	case 0x0201: return "TokRn";
-	case 0x0202: return "FDDI";
-	case 0x0203: return "ATM";
-	case 0x0204: return "ISDN";
-	case 0x0205: return "WrdFip";
-	case 0x0206: return "PICMG";
-	case 0x0207: return "IB";
-      }
-      return "Net";
-    case 0x03:
-      switch (class_id) {
-	case 0x0300: return "VGA";
-	case 0x0301: return "XGA";
-	case 0x0302: return "3D";
-      }
-      return "Disp";
-    case 0x04:
-      switch (class_id) {
-	case 0x0400: return "Video";
-	case 0x0401: return "Audio";
-	case 0x0402: return "Phone";
-	case 0x0403: return "Auddv";
-      }
-      return "MM";
-    case 0x05:
-      switch (class_id) {
-	case 0x0500: return "RAM";
-	case 0x0501: return "Flash";
-      }
-      return "Mem";
-    case 0x06:
-      switch (class_id) {
-	case 0x0600: return "Host";
-	case 0x0601: return "ISA";
-	case 0x0602: return "EISA";
-	case 0x0603: return "MC";
-	case 0x0604: return "PCI_B";
-	case 0x0605: return "PCMCIA";
-	case 0x0606: return "Nubus";
-	case 0x0607: return "CardBus";
-	case 0x0608: return "RACEway";
-	case 0x0609: return "PCI_SB";
-	case 0x060a: return "IB_B";
-      }
-      return "Bridg";
-    case 0x07:
-      switch (class_id) {
-	case 0x0700: return "Ser";
-	case 0x0701: return "Para";
-	case 0x0702: return "MSer";
-	case 0x0703: return "Modm";
-	case 0x0704: return "GPIB";
-	case 0x0705: return "SmrtCrd";
-      }
-      return "Comm";
-    case 0x08:
-      switch (class_id) {
-	case 0x0800: return "PIC";
-	case 0x0801: return "DMA";
-	case 0x0802: return "Time";
-	case 0x0803: return "RTC";
-	case 0x0804: return "HtPl";
-	case 0x0805: return "SD-HtPl";
-	case 0x0806: return "IOMMU";
-      }
-      return "Syst";
-    case 0x09:
-      switch (class_id) {
-	case 0x0900: return "Kbd";
-	case 0x0901: return "Pen";
-	case 0x0902: return "Mouse";
-	case 0x0903: return "Scan";
-	case 0x0904: return "Game";
-      }
-      return "In";
-    case 0x0a:
-      return "Dock";
-    case 0x0b:
-      switch (class_id) {
-	case 0x0b00: return "386";
-	case 0x0b01: return "486";
-	case 0x0b02: return "Pent";
-	case 0x0b10: return "Alpha";
-	case 0x0b20: return "PPC";
-	case 0x0b30: return "MIPS";
-	case 0x0b40: return "CoProc";
-      }
-      return "Proc";
-    case 0x0c:
-      switch (class_id) {
-	case 0x0c00: return "Firw";
-	case 0x0c01: return "ACCES";
-	case 0x0c02: return "SSA";
-	case 0x0c03: return "USB";
-	case 0x0c04: return "Fiber";
-	case 0x0c05: return "SMBus";
-	case 0x0c06: return "IB";
-	case 0x0c07: return "IPMI";
-	case 0x0c08: return "SERCOS";
-	case 0x0c09: return "CANBUS";
-      }
-      return "Ser";
-    case 0x0d:
-      switch (class_id) {
-	case 0x0d00: return "IRDA";
-	case 0x0d01: return "IR";
-	case 0x0d10: return "RF";
-	case 0x0d11: return "Blueth";
-	case 0x0d12: return "BroadB";
-	case 0x0d20: return "802.1a";
-	case 0x0d21: return "802.1b";
-      }
-      return "Wifi";
-    case 0x0e:
-      switch (class_id) {
-	case 0x0e00: return "I2O";
-      }
-      return "Intll";
-    case 0x0f:
-      switch (class_id) {
-	case 0x0f00: return "S-TV";
-	case 0x0f01: return "S-Aud";
-	case 0x0f02: return "S-Voice";
-	case 0x0f03: return "S-Data";
+  union hwloc_obj_attr_u attr;
+  hwloc_obj_type_t type;
+  int depth;
+  int err;
+
+  err = hwloc_type_sscanf(string, &type, &attr, sizeof(attr));
+  if (err < 0)
+    return err;
+
+  depth = hwloc_get_type_depth(topology, type);
+  if (type == HWLOC_OBJ_GROUP
+      && depth == HWLOC_TYPE_DEPTH_MULTIPLE
+      && attr.group.depth != (unsigned)-1) {
+    unsigned l;
+    depth = HWLOC_TYPE_DEPTH_UNKNOWN;
+    for(l=0; l<topology->nb_levels; l++) {
+      if (topology->levels[l][0]->type == HWLOC_OBJ_GROUP
+	  && topology->levels[l][0]->attr->group.depth == attr.group.depth) {
+	depth = (int)l;
+	break;
       }
-      return "Satel";
-    case 0x10:
-      return "Crypt";
-    case 0x11:
-      return "Signl";
-    case 0x12:
-      return "Accel";
-    case 0x13:
-      return "Instr";
-    case 0xff:
-      return "Oth";
+    }
   }
-  return "PCI";
+
+  if (typep)
+    *typep = type;
+  *depthp = depth;
+  return 0;
 }
 
 static const char* hwloc_obj_cache_type_letter(hwloc_obj_cache_type_t type)
@@ -464,33 +475,34 @@ hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
   hwloc_obj_type_t type = obj->type;
   switch (type) {
   case HWLOC_OBJ_MISC:
-  case HWLOC_OBJ_SYSTEM:
   case HWLOC_OBJ_MACHINE:
   case HWLOC_OBJ_NUMANODE:
+  case HWLOC_OBJ_MEMCACHE:
   case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_DIE:
   case HWLOC_OBJ_CORE:
   case HWLOC_OBJ_PU:
     return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
-  case HWLOC_OBJ_CACHE:
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
     return hwloc_snprintf(string, size, "L%u%s%s", obj->attr->cache.depth,
 			  hwloc_obj_cache_type_letter(obj->attr->cache.type),
-			  verbose ? hwloc_obj_type_string(type): "");
+			  verbose ? "Cache" : "");
   case HWLOC_OBJ_GROUP:
-	  /* TODO: more pretty presentation? */
     if (obj->attr->group.depth != (unsigned) -1)
       return hwloc_snprintf(string, size, "%s%u", hwloc_obj_type_string(type), obj->attr->group.depth);
     else
       return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
   case HWLOC_OBJ_BRIDGE:
-    if (verbose)
-      return snprintf(string, size, "Bridge %s->%s",
-		      obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCI" : "Host",
-		      "PCI");
-    else
-      return snprintf(string, size, obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCIBridge" : "HostBridge");
+    return hwloc_snprintf(string, size, obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCIBridge" : "HostBridge");
   case HWLOC_OBJ_PCI_DEVICE:
-    return snprintf(string, size, "PCI %04x:%04x",
-		    obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
+    return hwloc_snprintf(string, size, "PCI");
   case HWLOC_OBJ_OS_DEVICE:
     switch (obj->attr->osdev.type) {
     case HWLOC_OBJ_OSDEV_BLOCK: return hwloc_snprintf(string, size, "Block");
@@ -500,7 +512,8 @@ hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
     case HWLOC_OBJ_OSDEV_GPU: return hwloc_snprintf(string, size, "GPU");
     case HWLOC_OBJ_OSDEV_COPROC: return hwloc_snprintf(string, size, verbose ? "Co-Processor" : "CoProc");
     default:
-      *string = '\0';
+      if (size > 0)
+	*string = '\0';
       return 0;
     }
     break;
@@ -527,25 +540,25 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
   /* print memory attributes */
   res = 0;
   if (verbose) {
-    if (obj->memory.local_memory)
+    if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory)
       res = hwloc_snprintf(tmp, tmplen, "%slocal=%lu%s%stotal=%lu%s",
 			   prefix,
-			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
-			   hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose),
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->numanode.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->numanode.local_memory, verbose),
 			   separator,
-			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
-			   hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose));
-    else if (obj->memory.total_memory)
+			   (unsigned long) hwloc_memory_size_printf_value(obj->total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->total_memory, verbose));
+    else if (obj->total_memory)
       res = hwloc_snprintf(tmp, tmplen, "%stotal=%lu%s",
 			   prefix,
-			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
-			   hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+			   (unsigned long) hwloc_memory_size_printf_value(obj->total_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->total_memory, verbose));
   } else {
-    if (obj->memory.local_memory)
+    if (obj->type == HWLOC_OBJ_NUMANODE && obj->attr->numanode.local_memory)
       res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
 			   prefix,
-			   (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
-			   hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose));
+			   (unsigned long) hwloc_memory_size_printf_value(obj->attr->numanode.local_memory, verbose),
+			   hwloc_memory_size_printf_unit(obj->attr->numanode.local_memory, verbose));
   }
   if (res < 0)
     return -1;
@@ -553,14 +566,22 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
   if (ret > 0)
     prefix = separator;
   if (res >= tmplen)
-    res = tmplen>0 ? tmplen - 1 : 0;
+    res = tmplen>0 ? (int)tmplen - 1 : 0;
   tmp += res;
   tmplen -= res;
 
   /* printf type-specific attributes */
   res = 0;
   switch (obj->type) {
-  case HWLOC_OBJ_CACHE:
+  case HWLOC_OBJ_L1CACHE:
+  case HWLOC_OBJ_L2CACHE:
+  case HWLOC_OBJ_L3CACHE:
+  case HWLOC_OBJ_L4CACHE:
+  case HWLOC_OBJ_L5CACHE:
+  case HWLOC_OBJ_L1ICACHE:
+  case HWLOC_OBJ_L2ICACHE:
+  case HWLOC_OBJ_L3ICACHE:
+  case HWLOC_OBJ_MEMCACHE:
     if (verbose) {
       char assoc[32];
       if (obj->attr->cache.associativity == -1)
@@ -599,23 +620,20 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
       snprintf(down, sizeof(down), "buses=%04x:[%02x-%02x]",
 	       obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
       if (*up)
-	res = snprintf(string, size, "%s%s%s", up, separator, down);
+	res = hwloc_snprintf(string, size, "%s%s%s", up, separator, down);
       else
-	res = snprintf(string, size, "%s", down);
+	res = hwloc_snprintf(string, size, "%s", down);
     }
     break;
   case HWLOC_OBJ_PCI_DEVICE:
     if (verbose) {
       char linkspeed[64]= "";
-      char busid[16] = "[collapsed]";
       if (obj->attr->pcidev.linkspeed)
         snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
-      if (!hwloc_obj_get_info_by_name(obj, "lstopoCollapse"))
-	snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
-		 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
-      res = snprintf(string, size, "busid=%s%sclass=%04x(%s)%s",
-		     busid, separator,
-		     obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+      res = hwloc_snprintf(string, size, "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+			   obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+			   obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+			   obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
     }
     break;
   default:
@@ -627,7 +645,7 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
   if (ret > 0)
     prefix = separator;
   if (res >= tmplen)
-    res = tmplen>0 ? tmplen - 1 : 0;
+    res = tmplen>0 ? (int)tmplen - 1 : 0;
   tmp += res;
   tmplen -= res;
 
@@ -635,21 +653,17 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
   if (verbose) {
     unsigned i;
     for(i=0; i<obj->infos_count; i++) {
-      if (!strcmp(obj->infos[i].name, "lstopoCollapse"))
-	continue;
-      if (strchr(obj->infos[i].value, ' '))
-	res = hwloc_snprintf(tmp, tmplen, "%s%s=\"%s\"",
+      struct hwloc_info_s *info = &obj->infos[i];
+      const char *quote = strchr(info->value, ' ') ? "\"" : "";
+      res = hwloc_snprintf(tmp, tmplen, "%s%s=%s%s%s",
 			     prefix,
-			     obj->infos[i].name, obj->infos[i].value);
-      else
-	res = hwloc_snprintf(tmp, tmplen, "%s%s=%s",
-			     prefix,
-			     obj->infos[i].name, obj->infos[i].value);
+			     info->name,
+			     quote, info->value, quote);
       if (res < 0)
         return -1;
       ret += res;
       if (res >= tmplen)
-        res = tmplen>0 ? tmplen - 1 : 0;
+        res = tmplen>0 ? (int)tmplen - 1 : 0;
       tmp += res;
       tmplen -= res;
       if (ret > 0)
@@ -659,43 +673,3 @@ hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t
 
   return ret;
 }
-
-
-int
-hwloc_obj_snprintf(char *string, size_t size,
-    struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *l, const char *_indexprefix, int verbose)
-{
-  const char *indexprefix = _indexprefix ? _indexprefix : "#";
-  char os_index[12] = "";
-  char type[64];
-  char attr[128];
-  int attrlen;
-
-  if (l->os_index != (unsigned) -1) {
-    hwloc_snprintf(os_index, 12, "%s%u", indexprefix, l->os_index);
-  }
-
-  hwloc_obj_type_snprintf(type, sizeof(type), l, verbose);
-  attrlen = hwloc_obj_attr_snprintf(attr, sizeof(attr), l, " ", verbose);
-
-  if (attrlen > 0)
-    return hwloc_snprintf(string, size, "%s%s(%s)", type, os_index, attr);
-  else
-    return hwloc_snprintf(string, size, "%s%s", type, os_index);
-}
-
-int hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
-{
-  hwloc_bitmap_t set = hwloc_bitmap_alloc();
-  int res;
-  unsigned i;
-
-  hwloc_bitmap_zero(set);
-  for(i=0; i<nobj; i++)
-    if (objs[i]->cpuset)
-      hwloc_bitmap_or(set, set, objs[i]->cpuset);
-
-  res = hwloc_bitmap_snprintf(str, size, set);
-  hwloc_bitmap_free(set);
-  return res;
-}
diff --git a/ext/hwloc/include/hwloc.h b/ext/hwloc/include/hwloc.h
index 6c8d203d7..e106e9cc0 100644
--- a/ext/hwloc/include/hwloc.h
+++ b/ext/hwloc/include/hwloc.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -45,12 +45,16 @@
  * See hwloc/bitmap.h for bitmap specific macros.
  * See hwloc/helper.h for high-level topology traversal helpers.
  * See hwloc/inlines.h for the actual inline code of some functions below.
+ * See hwloc/export.h for exporting topologies to XML or to synthetic descriptions.
+ * See hwloc/distances.h for querying and modifying distances between objects.
+ * See hwloc/diff.h for manipulating differences between similar topologies.
  */
 
 #ifndef HWLOC_H
 #define HWLOC_H
 
-#include <hwloc/autogen/config.h>
+#include "hwloc/autogen/config.h"
+
 #include <sys/types.h>
 #include <stdio.h>
 #include <string.h>
@@ -59,13 +63,13 @@
 /*
  * Symbol transforms
  */
-#include <hwloc/rename.h>
+#include "hwloc/rename.h"
 
 /*
  * Bitmap definitions
  */
 
-#include <hwloc/bitmap.h>
+#include "hwloc/bitmap.h"
 
 
 #ifdef __cplusplus
@@ -77,14 +81,28 @@ extern "C" {
  * @{
  */
 
-/** \brief Indicate at build time which hwloc API version is being used. */
-#define HWLOC_API_VERSION 0x00020000
+/** \brief Indicate at build time which hwloc API version is being used.
+ *
+ * This number is updated to (X<<16)+(Y<<8)+Z when a new release X.Y.Z
+ * actually modifies the API.
+ *
+ * Users may check for available features at build time using this number
+ * (see \ref faq_version_api).
+ *
+ * \note This should not be confused with HWLOC_VERSION, the library version.
+ * Two stable releases of the same series usually have the same ::HWLOC_API_VERSION
+ * even if their HWLOC_VERSION are different.
+ */
+#define HWLOC_API_VERSION 0x00020100
 
-/** \brief Indicate at runtime which hwloc API version was used at build time. */
+/** \brief Indicate at runtime which hwloc API version was used at build time.
+ *
+ * Should be ::HWLOC_API_VERSION if running on the same version.
+ */
 HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
 
 /** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
-#define HWLOC_COMPONENT_ABI 5
+#define HWLOC_COMPONENT_ABI 6
 
 /** @} */
 
@@ -155,38 +173,22 @@ typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
  * may be defined in the future!  If you need to compare types, use
  * hwloc_compare_types() instead.
  */
+#define HWLOC_OBJ_TYPE_MIN HWLOC_OBJ_MACHINE /**< \private Sentinel value */
 typedef enum {
-    /* ***************************************************************
-       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
-
-       If new enum values are added here, you MUST also go update the
-       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
-
-       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
-       *************************************************************** */
-
-  HWLOC_OBJ_SYSTEM,	/**< \brief Whole system (may be a cluster of machines).
-  			  * The whole system that is accessible to hwloc.
-			  * That may comprise several machines in SSI systems.
-			  */
   HWLOC_OBJ_MACHINE,	/**< \brief Machine.
-			  * The typical root object type.
 			  * A set of processors and memory with cache
 			  * coherency.
-			  */
-  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
-			  * A set of processors around memory which the
-			  * processors can directly access.
 			  *
-			  * There is always at one such object in the topology
-			  * even if the machine is not NUMA.
-			  */
-  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package, what goes into a socket.
-			  * In the physical meaning, i.e. that you can add
-			  * or remove physically.
+			  * This type is always used for the root object of a topology,
+			  * and never used anywhere else.
+			  * Hence its parent is always \c NULL.
 			  */
-  HWLOC_OBJ_CACHE,	/**< \brief Cache.
-			  * Can be L1i, L1d, L2, L3, ...
+
+  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package.
+			  * The physical package that usually gets inserted
+			  * into a socket on the motherboard.
+			  * A processor package usually contains multiple cores,
+			  * and possibly some dies.
 			  */
   HWLOC_OBJ_CORE,	/**< \brief Core.
 			  * A computation unit (may be shared by several
@@ -197,10 +199,23 @@ typedef enum {
 			  * other logical processors, e.g. in the case of
 			  * an SMT core).
 			  *
+			  * This is the smallest object representing CPU resources,
+			  * it cannot have any child except Misc objects.
+			  *
 			  * Objects of this kind are always reported and can
 			  * thus be used as fallback when others are not.
 			  */
 
+  HWLOC_OBJ_L1CACHE,	/**< \brief Level 1 Data (or Unified) Cache. */
+  HWLOC_OBJ_L2CACHE,	/**< \brief Level 2 Data (or Unified) Cache. */
+  HWLOC_OBJ_L3CACHE,	/**< \brief Level 3 Data (or Unified) Cache. */
+  HWLOC_OBJ_L4CACHE,	/**< \brief Level 4 Data (or Unified) Cache. */
+  HWLOC_OBJ_L5CACHE,	/**< \brief Level 5 Data (or Unified) Cache. */
+
+  HWLOC_OBJ_L1ICACHE,	/**< \brief Level 1 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L2ICACHE,	/**< \brief Level 2 instruction Cache (filtered out by default). */
+  HWLOC_OBJ_L3ICACHE,	/**< \brief Level 3 instruction Cache (filtered out by default). */
+
   HWLOC_OBJ_GROUP,	/**< \brief Group objects.
 			  * Objects which do not fit in the above but are
 			  * detected by hwloc and are useful to take into
@@ -208,23 +223,34 @@ typedef enum {
 			  * expose their arbitrary processors aggregation this
 			  * way.  And hwloc may insert such objects to group
 			  * NUMA nodes according to their distances.
+			  * See also \ref faq_groups.
 			  *
-			  * These objects are ignored when they do not bring
-			  * any structure.
+			  * These objects are removed when they do not bring
+			  * any structure (see ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
 			  */
 
-  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects.
-			  * Objects without particular meaning, that can e.g. be
-			  * added by the application for its own use, or by hwloc
-			  * for miscellaneous objects such as MemoryDevice.
-			  * These objects are not listed in the main children list,
-			  * but rather in the dedicated misc children list.
-			  * Misc objects may only have Misc objects as children,
-			  * and those are in the dedicated misc children list as well.
-			  * Misc objects have NULL CPU and node sets.
+  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
+			  * An object that contains memory that is directly
+			  * and byte-accessible to the host processors.
+			  * It is usually close to some cores (the corresponding objects
+			  * are descendants of the NUMA node object in the hwloc tree).
+			  *
+			  * This is the smallest object representing Memory resources,
+			  * it cannot have any child except Misc objects.
+			  * However it may have Memory-side cache parents.
+			  *
+			  * There is always at least one such object in the topology
+			  * even if the machine is not NUMA.
+			  *
+			  * Memory objects are not listed in the main children list,
+			  * but rather in the dedicated Memory children list.
+			  *
+			  * NUMA nodes have a special depth ::HWLOC_TYPE_DEPTH_NUMANODE
+			  * instead of a normal depth just like other objects in the
+			  * main tree.
 			  */
 
-  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge.
+  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge (filtered out by default).
 			  * Any bridge that connects the host or an I/O bus,
 			  * to another I/O bus.
 			  * They are not added to the topology unless I/O discovery
@@ -233,14 +259,14 @@ typedef enum {
 			  * but rather in the dedicated io children list.
 			  * I/O objects have NULL CPU and node sets.
 			  */
-  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device.
+  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device (filtered out by default).
 			  * They are not added to the topology unless I/O discovery
 			  * is enabled with hwloc_topology_set_flags().
 			  * I/O objects are not listed in the main children list,
 			  * but rather in the dedicated io children list.
 			  * I/O objects have NULL CPU and node sets.
 			  */
-  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device.
+  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device (filtered out by default).
 			  * They are not added to the topology unless I/O discovery
 			  * is enabled with hwloc_topology_set_flags().
 			  * I/O objects are not listed in the main children list,
@@ -248,24 +274,43 @@ typedef enum {
 			  * I/O objects have NULL CPU and node sets.
 			  */
 
-  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
+  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects (filtered out by default).
+			  * Objects without particular meaning, that can e.g. be
+			  * added by the application for its own use, or by hwloc
+			  * for miscellaneous objects such as MemoryModule (DIMMs).
+			  * These objects are not listed in the main children list,
+			  * but rather in the dedicated misc children list.
+			  * Misc objects may only have Misc objects as children,
+			  * and those are in the dedicated misc children list as well.
+			  * Misc objects have NULL CPU and node sets.
+			  */
 
-    /* ***************************************************************
-       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+  HWLOC_OBJ_MEMCACHE,	/**< \brief Memory-side cache (filtered out by default).
+			  * A cache in front of a specific NUMA node.
+			  *
+			  * This object always has at least one NUMA node as a memory child.
+			  *
+			  * Memory objects are not listed in the main children list,
+			  * but rather in the dedicated Memory children list.
+			  *
+			  * Memory-side caches have a special depth ::HWLOC_TYPE_DEPTH_MEMCACHE
+			  * instead of a normal depth just like other objects in the
+			  * main tree.
+			  */
 
-       If new enum values are added here, you MUST also go update the
-       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+  HWLOC_OBJ_DIE,	/**< \brief Die within a physical package.
+			 * A subpart of the physical package, that contains multiple cores.
+			 * \hideinitializer
+			 */
 
-       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
-       *************************************************************** */
+  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
 } hwloc_obj_type_t;
 
 /** \brief Cache type. */
 typedef enum hwloc_obj_cache_type_e {
   HWLOC_OBJ_CACHE_UNIFIED,      /**< \brief Unified cache. */
   HWLOC_OBJ_CACHE_DATA,         /**< \brief Data cache. */
-  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache.
-				  * Only used when the HWLOC_TOPOLOGY_FLAG_ICACHES topology flag is set. */
+  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache (filtered out by default). */
 } hwloc_obj_cache_type_t;
 
 /** \brief Type of one side (upstream or downstream) of an I/O bridge. */
@@ -276,15 +321,16 @@ typedef enum hwloc_obj_bridge_type_e {
 
 /** \brief Type of a OS device. */
 typedef enum hwloc_obj_osdev_type_e {
-  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device.
-				  * For instance "sda" on Linux. */
+  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device, or non-volatile memory device.
+				  * For instance "sda" or "dax2.0" on Linux. */
   HWLOC_OBJ_OSDEV_GPU,		/**< \brief Operating system GPU device.
 				  * For instance ":0.0" for a GL display,
 				  * "card0" for a Linux DRM device. */
   HWLOC_OBJ_OSDEV_NETWORK,	/**< \brief Operating system network device.
 				  * For instance the "eth0" interface on Linux. */
   HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system openfabrics device.
-				  * For instance the "mlx4_0" InfiniBand HCA device on Linux. */
+				  * For instance the "mlx4_0" InfiniBand HCA,
+				  * or "hfi1_0" Omni-Path interface on Linux. */
   HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system dma engine device.
 				  * For instance the "dma0chan0" DMA channel on Linux. */
   HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
@@ -300,12 +346,14 @@ typedef enum hwloc_obj_osdev_type_e {
  * respectively if \p type1 objects usually include \p type2 objects, are the
  * same as \p type2 objects, or are included in \p type2 objects. If the types
  * can not be compared (because neither is usually contained in the other),
- * HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
+ * ::HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
  * be compared (usually, a system contains machines which contain nodes which
  * contain packages which contain caches, which contain cores, which contain
  * processors).
  *
- * \note HWLOC_OBJ_PU will always be the deepest.
+ * \note ::HWLOC_OBJ_PU will always be the deepest,
+ * while ::HWLOC_OBJ_MACHINE is always the highest.
+ *
  * \note This does not mean that the actual topology will respect that order:
  * e.g. as of today cores may also contain caches, and packages may also contain
  * nodes. This is thus just to be seen as a fallback comparison method.
@@ -313,7 +361,7 @@ typedef enum hwloc_obj_osdev_type_e {
 HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
 
 enum hwloc_compare_types_e {
-    HWLOC_TYPE_UNORDERED = INT_MAX	/**< \brief Value returned by hwloc_compare_types when types can not be compared. \hideinitializer */
+    HWLOC_TYPE_UNORDERED = INT_MAX	/**< \brief Value returned by hwloc_compare_types() when types can not be compared. \hideinitializer */
 };
 
 /** @} */
@@ -326,51 +374,53 @@ enum hwloc_compare_types_e {
 
 union hwloc_obj_attr_u;
 
-/** \brief Object memory */
-struct hwloc_obj_memory_s {
-  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in this object and its children */
-  hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
-
-  /** \brief Size of array \p page_types */
-  unsigned page_types_len;
-  /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types is 0.
-   *
-   * The array is sorted by increasing \p size fields.
-   * It contains \p page_types_len slots.
-   */
-  struct hwloc_obj_memory_page_type_s {
-    hwloc_uint64_t size;	/**< \brief Size of pages */
-    hwloc_uint64_t count;	/**< \brief Number of pages of this size */
-  } * page_types;
-};
-
 /** \brief Structure of a topology object
  *
- * Applications must not modify any field except hwloc_obj.userdata.
+ * Applications must not modify any field except \p hwloc_obj.userdata.
  */
 struct hwloc_obj {
   /* physical information */
   hwloc_obj_type_t type;		/**< \brief Type of object */
+  char *subtype;			/**< \brief Subtype string to better describe the type field. */
+
   unsigned os_index;			/**< \brief OS-provided physical index number.
 					 * It is not guaranteed unique across the entire machine,
 					 * except for PUs and NUMA nodes.
+					 * Set to HWLOC_UNKNOWN_INDEX if unknown or irrelevant for this object.
+					 */
+#define HWLOC_UNKNOWN_INDEX (unsigned)-1
+
+  char *name;				/**< \brief Object-specific name if any.
+					 * Mostly used for identifying OS devices and Misc objects where
+					 * a name string is more useful than numerical indexes.
 					 */
-  char *name;				/**< \brief Object description if any */
 
-  struct hwloc_obj_memory_s memory;	/**< \brief Memory attributes */
+  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in NUMA nodes below this object. */
 
   union hwloc_obj_attr_u *attr;		/**< \brief Object type-specific Attributes,
 					 * may be \c NULL if no attribute value was found */
 
   /* global position */
-  unsigned depth;			/**< \brief Vertical index in the hierarchy.
-					 * If the topology is symmetric, this is equal to the
-					 * parent depth plus one, and also equal to the number
-					 * of parent/child links from the root object to here.
+  int depth;				/**< \brief Vertical index in the hierarchy.
+					 *
+					 * For normal objects, this is the depth of the horizontal level
+					 * that contains this object and its cousins of the same type.
+					 * If the topology is symmetric, this is equal to the parent depth
+					 * plus one, and also equal to the number of parent/child links
+					 * from the root object to here.
+					 *
+					 * For special objects (NUMA nodes, I/O and Misc) that are not
+					 * in the main tree, this is a special negative value that
+					 * corresponds to their dedicated level,
+					 * see hwloc_get_type_depth() and ::hwloc_get_type_depth_e.
+					 * Those special values can be passed to hwloc functions such as
+					 * hwloc_get_nbobjs_by_depth() as usual.
 					 */
   unsigned logical_index;		/**< \brief Horizontal index in the whole list of similar objects,
 					 * hence guaranteed unique across the entire machine.
 					 * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+					 * Note that this index may change when restricting the topology
+					 * or when inserting a group.
 					 */
 
   /* cousins are all objects of the same type (and depth) across the entire topology */
@@ -378,32 +428,70 @@ struct hwloc_obj {
   struct hwloc_obj *prev_cousin;	/**< \brief Previous object of same type and depth */
 
   /* children of the same parent are siblings, even if they may have different type and depth */
-  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (system object) */
-  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's I/O or Misc children list. */
-  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent */
-  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent */
-
-  /* children array below this object (except I/O and Misc children) */
-  unsigned arity;			/**< \brief Number of children */
-  struct hwloc_obj **children;		/**< \brief Children, \c children[0 .. arity -1] */
-  struct hwloc_obj *first_child;	/**< \brief First child */
-  struct hwloc_obj *last_child;		/**< \brief Last child */
+  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (Machine object) */
+  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's Memory, I/O or Misc children list. */
+  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent (inside the same list of children). */
+  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent (inside the same list of children). */
+  /** @name List and array of normal children below this object (except Memory, I/O and Misc children). */
+  /**@{*/
+  unsigned arity;			/**< \brief Number of normal children.
+					 * Memory, Misc and I/O children are not listed here
+					 * but rather in their dedicated children list.
+					 */
+  struct hwloc_obj **children;		/**< \brief Normal children, \c children[0 .. arity -1] */
+  struct hwloc_obj *first_child;	/**< \brief First normal child */
+  struct hwloc_obj *last_child;		/**< \brief Last normal child */
+  /**@}*/
 
   int symmetric_subtree;		/**< \brief Set if the subtree of normal objects below this object is symmetric,
-					  * which means all children and their children have identical subtrees.
-					  * I/O and Misc children are ignored.
+					  * which means all normal children and their children have identical subtrees.
+					  *
+					  * Memory, I/O and Misc children are ignored.
 					  *
 					  * If set in the topology root object, lstopo may export the topology
 					  * as a synthetic string.
 					  */
 
-  /* specific list of I/O children */
-  unsigned io_arity;			/**< \brief Number of I/O children */
-  struct hwloc_obj *io_first_child;	/**< \brief First I/O child */
+  /** @name List of Memory children below this object. */
+  /**@{*/
+  unsigned memory_arity;		/**< \brief Number of Memory children.
+					 * These children are listed in \p memory_first_child.
+					 */
+  struct hwloc_obj *memory_first_child;	/**< \brief First Memory child.
+					 * NUMA nodes and Memory-side caches are listed here
+					 * (\p memory_arity and \p memory_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_memory().
+					 *
+					 * A memory hierarchy starts from a normal CPU-side object
+					 * (e.g. Package) and ends with NUMA nodes as leaves.
+					 * There might exist some memory-side caches between them
+					 * in the middle of the memory subtree.
+					 */
+  /**@}*/
+
+  /** @name List of I/O children below this object. */
+  /**@{*/
+  unsigned io_arity;			/**< \brief Number of I/O children.
+					 * These children are listed in \p io_first_child.
+					 */
+  struct hwloc_obj *io_first_child;	/**< \brief First I/O child.
+					 * Bridges, PCI and OS devices are listed here (\p io_arity and \p io_first_child)
+					 * instead of in the normal children list.
+					 * See also hwloc_obj_type_is_io().
+					 */
+  /**@}*/
 
-  /* specific list of Misc children */
-  unsigned misc_arity;			/**< \brief Number of Misc children */
-  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child */
+  /** @name List of Misc children below this object. */
+  /**@{*/
+  unsigned misc_arity;			/**< \brief Number of Misc children.
+					 * These children are listed in \p misc_first_child.
+					 */
+  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child.
+					 * Misc objects are listed here (\p misc_arity and \p misc_first_child)
+					 * instead of in the normal children list.
+					 */
+  /**@}*/
 
   /* cpusets and nodesets */
   hwloc_cpuset_t cpuset;		/**< \brief CPUs covered by this object
@@ -413,91 +501,64 @@ struct hwloc_obj {
                                           * object and known how (the children path between this object and the PU
                                           * objects).
                                           *
-                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
-                                          * some of these CPUs may not be allowed for binding, see allowed_cpuset.
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
+                                          * some of these CPUs may not be allowed for binding,
+                                          * see hwloc_topology_get_allowed_cpuset().
                                           *
 					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
 					  *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
                                           */
   hwloc_cpuset_t complete_cpuset;       /**< \brief The complete CPU set of logical processors of this object,
                                           *
-                                          * This may include not only the same as the cpuset field, but also the CPUs for
-                                          * which topology information is unknown or incomplete, the offlines CPUS, and
-                                          * the CPUs that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM flag
+                                          * This may include not only the same as the cpuset field, but also some CPUs for
+                                          * which topology information is unknown or incomplete, some offline CPUs, and
+                                          * the CPUs that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED flag
                                           * is not set.
                                           * Thus no corresponding PU object may be found in the topology, because the
                                           * precise position is undefined. It is however known that it would be somewhere
                                           * under this object.
                                           *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
-                                          */
-  hwloc_cpuset_t allowed_cpuset;        /**< \brief The CPU set of allowed logical processors
-                                          *
-                                          * This includes the CPUs contained in this object which are allowed for
-                                          * binding, i.e. passing them to the hwloc binding functions should not return
-                                          * permission errors.  This is usually restricted by administration rules.
-                                          *
-                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
-                                          * allowed_cpuset may be smaller than cpuset. Otherwise they are identical.
-                                          *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
                                           */
 
   hwloc_nodeset_t nodeset;              /**< \brief NUMA nodes covered by this object or containing this object
                                           *
-                                          * This is the set of NUMA nodes for which there are NODE objects in the
+                                          * This is the set of NUMA nodes for which there are NUMA node objects in the
                                           * topology under or above this object, i.e. which are known to be physically
                                           * contained in this object or containing it and known how (the children path
-                                          * between this object and the NODE objects).
+                                          * between this object and the NUMA node objects).
                                           *
                                           * In the end, these nodes are those that are close to the current object.
                                           *
-                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
-                                          * some of these nodes may not be allowed for allocation, see allowed_nodeset.
+                                          * If the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED configuration flag is set,
+                                          * some of these nodes may not be allowed for allocation,
+                                          * see hwloc_topology_get_allowed_nodeset().
                                           *
                                           * If there are no NUMA nodes in the machine, all the memory is close to this
                                           * object, so only the first bit may be set in \p nodeset.
                                           *
 					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
 					  *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
                                           */
   hwloc_nodeset_t complete_nodeset;     /**< \brief The complete NUMA node set of this object,
                                           *
-                                          * This may include not only the same as the nodeset field, but also the NUMA
-                                          * nodes for which topology information is unknown or incomplete, the offlines
-                                          * nodes, and the nodes that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+                                          * This may include not only the same as the nodeset field, but also some NUMA
+                                          * nodes for which topology information is unknown or incomplete, some offline
+                                          * nodes, and the nodes that are ignored when the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
                                           * flag is not set.
-                                          * Thus no corresponding NODE object may be found in the topology, because the
+                                          * Thus no corresponding NUMA node object may be found in the topology, because the
                                           * precise position is undefined. It is however known that it would be
                                           * somewhere under this object.
                                           *
                                           * If there are no NUMA nodes in the machine, all the memory is close to this
                                           * object, so only the first bit is set in \p complete_nodeset.
                                           *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
-                                          */
-  hwloc_nodeset_t allowed_nodeset;      /**< \brief The set of allowed NUMA memory nodes
-                                          *
-                                          * This includes the NUMA memory nodes contained in this object which are
-                                          * allowed for memory allocation, i.e. passing them to NUMA node-directed
-                                          * memory allocation should not return permission errors. This is usually
-                                          * restricted by administration rules.
-                                          *
-                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
-                                          * allowed_nodeset may be smaller than nodeset. Otherwise they are identical.
-                                          *
-                                          * If there are no NUMA nodes in the machine, all the memory is close to this
-                                          * object, so only the first bit may be set in \p allowed_nodeset.
-                                          *
-                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead.
                                           */
 
-  struct hwloc_distances_s **distances;	/**< \brief Distances between all objects at same depth below this object */
-  unsigned distances_count;
-
-  struct hwloc_obj_info_s *infos;	/**< \brief Array of stringified info type=name. */
+  struct hwloc_info_s *infos;		/**< \brief Array of stringified info type=name. */
   unsigned infos_count;			/**< \brief Size of infos array. */
 
   /* misc */
@@ -505,6 +566,12 @@ struct hwloc_obj {
 					 * initialized to \c NULL, use it as you wish.
 					 * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
 					 * if you wish to export this field to XML. */
+
+  hwloc_uint64_t gp_index;			/**< \brief Global persistent index.
+					 * Generated by hwloc, unique across the topology (contrary to os_index)
+					 * and persistent across topology changes (contrary to logical_index).
+					 * Mostly used internally, but could also be used by applications to identify objects.
+					 */
 };
 /**
  * \brief Convenience typedef; a pointer to a struct hwloc_obj.
@@ -513,6 +580,21 @@ typedef struct hwloc_obj * hwloc_obj_t;
 
 /** \brief Object type-specific Attributes */
 union hwloc_obj_attr_u {
+  /** \brief NUMA node-specific Object Attributes */
+  struct hwloc_numanode_attr_s {
+    hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+    unsigned page_types_len; /**< \brief Size of array \p page_types */
+    /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types_len is 0.
+     *
+     * The array is sorted by increasing \p size fields.
+     * It contains \p page_types_len slots.
+     */
+    struct hwloc_memory_page_type_s {
+      hwloc_uint64_t size;	/**< \brief Size of pages */
+      hwloc_uint64_t count;	/**< \brief Number of pages of this size */
+    } * page_types;
+  } numanode;
+
   /** \brief Cache-specific Object Attributes */
   struct hwloc_cache_attr_s {
     hwloc_uint64_t size;		  /**< \brief Size of cache in bytes */
@@ -524,7 +606,11 @@ union hwloc_obj_attr_u {
   } cache;
   /** \brief Group-specific Object Attributes */
   struct hwloc_group_attr_s {
-    unsigned depth;			  /**< \brief Depth of group object */
+    unsigned depth;			  /**< \brief Depth of group object.
+					   *   It may change if intermediate Group objects are added. */
+    unsigned kind;			  /**< \brief Internally-used kind of group. */
+    unsigned subkind;			  /**< \brief Internally-used subkind to distinguish different levels of groups with same kind */
+    unsigned char dont_merge;		  /**< \brief Flag preventing groups from being automatically merged with identical parent or children. */
   } group;
   /** \brief PCI Device specific Object Attributes */
   struct hwloc_pcidev_attr_s {
@@ -556,48 +642,11 @@ union hwloc_obj_attr_u {
   } osdev;
 };
 
-/** \brief Distances between objects
- *
- * One object may contain a distance structure describing distances
- * between all its descendants at a given relative depth. If the
- * containing object is the root object of the topology, then the
- * distances are available for all objects in the machine.
- *
- * If the \p latency pointer is not \c NULL, the pointed array contains
- * memory latencies (non-zero values), see below.
- *
- * In the future, some other types of distances may be considered.
- * In these cases, \p latency may be \c NULL.
- */
-struct hwloc_distances_s {
-  unsigned relative_depth;	/**< \brief Relative depth of the considered objects
-				 * below the object containing this distance information. */
-  unsigned nbobjs;		/**< \brief Number of objects considered in the matrix.
-				 * It is the number of descendant objects at \p relative_depth
-				 * below the containing object.
-				 * It corresponds to the result of hwloc_get_nbobjs_inside_cpuset_by_depth(). */
-
-  float *latency;		/**< \brief Matrix of latencies between objects, stored as a one-dimension array.
-				 * May be \c NULL if the distances considered here are not latencies.
-				 *
-				 * Unless defined by the user, this currently contains latencies
-				 * between NUMA nodes (as reported in the System Locality Distance Information Table
-				 * (SLIT) in the ACPI specification), which may or may not be accurate.
-				 * It corresponds to the latency for accessing the memory of one node
-				 * from a core in another node.
-				 *
-				 * Values are normalized to get 1.0 as the minimal value in the matrix.
-				 * Latency from i-th to j-th object is stored in slot i*nbobjs+j.
-				 */
-  float latency_max;		/**< \brief The maximal value in the latency matrix. */
-  float latency_base;		/**< \brief The multiplier that should be applied to latency matrix
-				 * to retrieve the original OS-provided latencies.
-				 * Usually 10 on Linux since ACPI SLIT uses 10 for local latency.
-				 */
-};
-
-/** \brief Object info */
-struct hwloc_obj_info_s {
+/** \brief Object info
+ *
+ * \sa hwlocality_info_attr
+ */
+struct hwloc_info_s {
   char *name;	/**< \brief Info name */
   char *value;	/**< \brief Info value */
 };
@@ -640,6 +689,9 @@ HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
  *
  * \note This function may be called only once per topology.
  *
+ * \note The binding of the current thread or process may temporarily change
+ * during this call but it will be restored before it returns.
+ *
  * \sa hwlocality_configuration and hwlocality_setsource
  */
 HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
@@ -656,9 +708,31 @@ HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
  * are duplicated into a new one.
  *
  * This is useful for keeping a backup while modifying a topology.
+ *
+ * \note Object userdata is not duplicated since hwloc does not know what it points to.
+ * The objects of both old and new topologies will point to the same userdata.
  */
 HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
 
+/** \brief Verify that the topology is compatible with the current hwloc library.
+ *
+ * This is useful when using the same topology structure (in memory)
+ * in different libraries that may use different hwloc installations
+ * (for instance if one library embeds a specific version of hwloc,
+ * while another library uses a default system-wide hwloc installation).
+ *
+ * If all libraries/programs use the same hwloc installation, this function
+ * always returns success.
+ *
+ * \return \c 0 on success.
+ *
+ * \return \c -1 with \p errno set to \c EINVAL if incompatible.
+ *
+ * \note If sharing between processes with hwloc_shmem_topology_write(),
+ * the relevant check is already performed inside hwloc_shmem_topology_adopt().
+ */
+HWLOC_DECLSPEC int hwloc_topology_abi_check(hwloc_topology_t topology);
+
 /** \brief Run internal checks on a topology structure
  *
  * The program aborts if an inconsistency is detected in the given topology.
@@ -687,51 +761,82 @@ HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
 
 /** \brief Get the depth of the hierarchical tree of objects.
  *
- * This is the depth of HWLOC_OBJ_PU objects plus one.
+ * This is the depth of ::HWLOC_OBJ_PU objects plus one.
+ *
+ * \note NUMA nodes, I/O and Misc objects are ignored when computing
+ * the depth of the tree (they are placed on special levels).
  */
-HWLOC_DECLSPEC unsigned hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+HWLOC_DECLSPEC int hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
 
 /** \brief Returns the depth of objects of type \p type.
  *
  * If no object of this type is present on the underlying architecture, or if
  * the OS doesn't provide this kind of information, the function returns
- * HWLOC_TYPE_DEPTH_UNKNOWN.
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN.
  *
  * If type is absent but a similar type is acceptable, see also
  * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
  *
- * If some objects of the given type exist in different levels,
- * for instance L1 and L2 caches, or L1i and L1d caches,
- * the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
- * See hwloc_get_cache_type_depth() in hwloc/helper.h to better handle this
- * case.
+ * If ::HWLOC_OBJ_GROUP is given, the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE
+ * if multiple levels of Groups exist.
  *
- * If an I/O object type is given, the function returns a virtual value
- * because I/O objects are stored in special levels that are not CPU-related.
+ * If a NUMA node, I/O or Misc object type is given, the function returns a virtual
+ * value because these objects are stored in special levels that are not CPU-related.
  * This virtual depth may be passed to other hwloc functions such as
  * hwloc_get_obj_by_depth() but it should not be considered as an actual
  * depth by the application. In particular, it should not be compared with
  * any other object depth or with the entire topology depth.
+ * \sa hwloc_get_memory_parents_depth().
+ *
+ * \sa hwloc_type_sscanf_as_depth() for returning the depth of objects
+ * whose type is given as a string.
  */
 HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
 
 enum hwloc_get_type_depth_e {
     HWLOC_TYPE_DEPTH_UNKNOWN = -1,    /**< \brief No object of given type exists in the topology. \hideinitializer */
-    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depth in the topology. \hideinitializer */
-    HWLOC_TYPE_DEPTH_BRIDGE = -3,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
-    HWLOC_TYPE_DEPTH_PCI_DEVICE = -4, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
-    HWLOC_TYPE_DEPTH_OS_DEVICE = -5,  /**< \brief Virtual depth for software device object level. \hideinitializer */
-    HWLOC_TYPE_DEPTH_MISC = -6        /**< \brief Virtual depth for Misc object. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depth in the topology (only for Groups). \hideinitializer */
+    HWLOC_TYPE_DEPTH_NUMANODE = -3,   /**< \brief Virtual depth for NUMA nodes. \hideinitializer */
+    HWLOC_TYPE_DEPTH_BRIDGE = -4,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_PCI_DEVICE = -5, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_OS_DEVICE = -6,  /**< \brief Virtual depth for software device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MISC = -7,       /**< \brief Virtual depth for Misc object. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MEMCACHE = -8    /**< \brief Virtual depth for MemCache object. \hideinitializer */
 };
 
+/** \brief Return the depth of parents where memory objects are attached.
+ *
+ * Memory objects have virtual negative depths because they are not part of
+ * the main CPU-side hierarchy of objects. This depth should not be compared
+ * with other level depths.
+ *
+ * If all Memory objects are attached to Normal parents at the same depth,
+ * this parent depth may be compared to other depths as usual, for instance
+ * for knowing whether NUMA nodes are attached above or below Packages.
+ *
+ * \return The depth of Normal parents of all memory children
+ * if all these parents have the same depth. For instance the depth of
+ * the Package level if all NUMA nodes are attached to Package objects.
+ *
+ * \return ::HWLOC_TYPE_DEPTH_MULTIPLE if Normal parents of all
+ * memory children do not have the same depth. For instance if some
+ * NUMA nodes are attached to Packages while others are attached to
+ * Groups.
+ */
+HWLOC_DECLSPEC int hwloc_get_memory_parents_depth (hwloc_topology_t topology);
+
 /** \brief Returns the depth of objects of type \p type or below
  *
  * If no object of this type is present on the underlying architecture, the
  * function returns the depth of the first "present" object typically found
  * inside \p type.
  *
- * If some objects of the given type exist in different levels, for instance
- * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
  */
 static __hwloc_inline int
 hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
@@ -742,21 +847,27 @@ hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
  * function returns the depth of the first "present" object typically
  * containing \p type.
  *
- * If some objects of the given type exist in different levels, for instance
- * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ * This function is only meaningful for normal object types.
+ * If a memory, I/O or Misc object type is given, the corresponding virtual
+ * depth is always returned (see hwloc_get_type_depth()).
+ *
+ * May return ::HWLOC_TYPE_DEPTH_MULTIPLE for ::HWLOC_OBJ_GROUP just like
+ * hwloc_get_type_depth().
  */
 static __hwloc_inline int
 hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
 
 /** \brief Returns the type of objects at depth \p depth.
  *
- * \return -1 if depth \p depth does not exist.
+ * \p depth should be between 0 and hwloc_topology_get_depth()-1.
+ *
+ * \return (hwloc_obj_type_t)-1 if depth \p depth does not exist.
  */
-HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
 
 /** \brief Returns the width of level at depth \p depth.
  */
-HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
 
 /** \brief Returns the width of level type \p type
  *
@@ -768,20 +879,19 @@ hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hw
 
 /** \brief Returns the top-object of the topology-tree.
  *
- * Its type is typically ::HWLOC_OBJ_MACHINE but it could be different
- * for complex topologies.
+ * Its type is ::HWLOC_OBJ_MACHINE.
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
 
 /** \brief Returns the topology object at logical index \p idx from depth \p depth */
-HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, unsigned depth, unsigned idx) __hwloc_attribute_pure;
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, int depth, unsigned idx) __hwloc_attribute_pure;
 
 /** \brief Returns the topology object at logical index \p idx with type \p type
  *
  * If no object for that type exists, \c NULL is returned.
- * If there are several levels with objects of that type, \c NULL is returned
- * and ther caller may fallback to hwloc_get_obj_by_depth().
+ * If there are several levels with objects of that type (::HWLOC_OBJ_GROUP),
+ * \c NULL is returned and the caller may fallback to hwloc_get_obj_by_depth().
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
@@ -791,7 +901,7 @@ hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigne
  * If \p prev is \c NULL, return the first object at depth \p depth.
  */
 static __hwloc_inline hwloc_obj_t
-hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev);
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev);
 
 /** \brief Returns the next object of type \p type.
  *
@@ -807,56 +917,40 @@ hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
 
 
 
-/** \defgroup hwlocality_object_strings Manipulating Object Type, Sets and Attributes as Strings
+/** \defgroup hwlocality_object_strings Converting between Object Types and Attributes, and Strings
  * @{
  */
 
-/** \brief Return a stringified topology object type */
-HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
-
-/** \brief Return an object type and attributes from a type string.
- *
- * Convert strings such as "Package" or "Cache" into the corresponding types.
- * Matching is case-insensitive, and only the first letters are actually
- * required to match.
- *
- * Types that have specific attributes, for instance caches and groups,
- * may be returned in \p depthattrp and \p typeattrp. They are ignored
- * when these pointers are \c NULL.
- *
- * For instance "L2i" or "L2iCache" would return
- * type HWLOC_OBJ_CACHE in \p typep, 2 in \p depthattrp,
- * and HWLOC_OBJ_CACHE_TYPE_INSTRUCTION in \p typeattrp
- * (this last pointer should point to a hwloc_obj_cache_type_t).
- * "Group3" would return type HWLOC_OBJ_GROUP type and 3 in \p depthattrp.
- * Attributes that are not specified in the string (for instance "Group"
- * without a depth, or "L2Cache" without a cache type) are set to -1.
+/** \brief Return a constant stringified object type.
  *
- * \p typeattrd is only filled if the size specified in \p typeattrsize
- * is large enough. It is currently only used for caches, and the required
- * size is at least the size of hwloc_obj_cache_type_t.
+ * This function is the basic way to convert a generic type into a string.
+ * The output string may be parsed back by hwloc_type_sscanf().
  *
- * \return 0 if a type was correctly identified, otherwise -1.
- *
- * \note This is an extended version of the now deprecated hwloc_obj_type_of_string()
+ * hwloc_obj_type_snprintf() may return a more precise output for a specific
+ * object, but it requires the caller to provide the output buffer.
  */
-HWLOC_DECLSPEC int hwloc_obj_type_sscanf(const char *string,
-					 hwloc_obj_type_t *typep,
-					 int *depthattrp,
-					 void *typeattrp, size_t typeattrsize);
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
 
 /** \brief Stringify the type of a given topology object into a human-readable form.
  *
- * It differs from hwloc_obj_type_string() because it prints type attributes such
- * as cache depth and type.
+ * Contrary to hwloc_obj_type_string(), this function includes object-specific
+ * attributes (such as the Group depth, the Bridge type, or OS device type)
+ * in the output, and it requires the caller to provide the output buffer.
+ *
+ * The output is guaranteed to be the same for all objects of a same topology level.
+ *
+ * If \p verbose is 1, longer type names are used, e.g. L1Cache instead of L1.
+ *
+ * The output string may be parsed back by hwloc_type_sscanf().
  *
  * If \p size is 0, \p string may safely be \c NULL.
  *
  * \return the number of character that were actually written if not truncating,
  * or that would have been written (not including the ending \\0).
  */
-HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj,
-				   int verbose);
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj,
+					   int verbose);
 
 /** \brief Stringify the attributes of a given topology object into a human-readable form.
  *
@@ -869,17 +963,70 @@ HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_
  * \return the number of character that were actually written if not truncating,
  * or that would have been written (not including the ending \\0).
  */
-HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * __hwloc_restrict separator,
-				   int verbose);
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
+					   hwloc_obj_t obj, const char * __hwloc_restrict separator,
+					   int verbose);
 
-/** \brief Stringify the cpuset containing a set of objects.
+/** \brief Return an object type and attributes from a type string.
  *
- * If \p size is 0, \p string may safely be \c NULL.
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
  *
- * \return the number of character that were actually written if not truncating,
- * or that would have been written (not including the ending \\0).
+ * The matched object type is set in \p typep (which cannot be \c NULL).
+ *
+ * Type-specific attributes, for instance Cache type, Cache depth, Group depth,
+ * Bridge type or OS Device type may be returned in \p attrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p attrp is only filled if not \c NULL and if its size specified in \p attrsize
+ * is large enough. It should be at least as large as union hwloc_obj_attr_u.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_sscanf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf(const char *string,
+				     hwloc_obj_type_t *typep,
+				     union hwloc_obj_attr_u *attrp, size_t attrsize);
+
+/** \brief Return an object type and its level depth from a type string.
+ *
+ * Convert strings such as "Package" or "L1iCache" into the corresponding types
+ * and return in \p depthp the depth of the corresponding level in the
+ * topology \p topology.
+ *
+ * If no object of this type is present on the underlying architecture,
+ * ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If multiple such levels exist (for instance if giving Group without any depth),
+ * the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE instead.
+ *
+ * The matched object type is set in \p typep if \p typep is non \c NULL.
+ *
+ * \note This function is similar to hwloc_type_sscanf() followed
+ * by hwloc_get_type_depth() but it also automatically disambiguates
+ * multiple group levels etc.
+ *
+ * \note This function is guaranteed to match any string returned by
+ * hwloc_obj_type_string() or hwloc_obj_type_snprintf().
+ */
+HWLOC_DECLSPEC int hwloc_type_sscanf_as_depth(const char *string,
+					      hwloc_obj_type_t *typep,
+					      hwloc_topology_t topology, int *depthp);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_info_attr Consulting and Adding Key-Value Info Attributes
+ *
+ * @{
  */
-HWLOC_DECLSPEC int hwloc_obj_cpuset_snprintf(char * __hwloc_restrict str, size_t size, size_t nobj, const hwloc_obj_t * __hwloc_restrict objs);
 
 /** \brief Search the given key name in object infos and return the corresponding value.
  *
@@ -897,6 +1044,8 @@ hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_
  *
  * The input strings are copied before being added in the object infos.
  *
+ * \return \c 0 on success, \c -1 on error.
+ *
  * \note This function may be used to enforce object colors in the lstopo
  * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
  * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
@@ -904,7 +1053,7 @@ hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_
  * \note If \p value contains some non-printable characters, they will
  * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
  */
-HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
+HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
 
 /** @} */
 
@@ -912,10 +1061,13 @@ HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const
 
 /** \defgroup hwlocality_cpubinding CPU binding
  *
- * It is often useful to call hwloc_bitmap_singlify() first so that a single CPU
- * remains in the set. This way, the process will not even migrate between
- * different CPUs inside the given set.
- * Some operating systems also only support that kind of binding.
+ * Some operating systems only support binding threads or processes to a single PU.
+ * Others allow binding to larger sets such as entire Cores or Packages or
+ * even random sets of individual PUs. In such operating systems, the scheduler
+ * is free to run the task on one of these PUs, then migrate it to another PU, etc.
+ * It is often useful to call hwloc_bitmap_singlify() on the target CPU set before
+ * passing it to the binding function to avoid these expensive migrations.
+ * See the documentation of hwloc_bitmap_singlify() for details.
  *
  * Some operating systems do not provide all hwloc-supported
  * mechanisms to bind processes, threads, etc.
@@ -957,7 +1109,7 @@ HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const
  * \note On some operating systems, CPU binding may have effects on memory binding, see
  * ::HWLOC_CPUBIND_NOMEMBIND
  *
- * \note Running lstopo --top or hwloc-ps can be a very convenient tool to check
+ * \note Running lstopo \--top or hwloc-ps can be a very convenient tool to check
  * how binding actually happened.
  * @{
  */
@@ -1046,10 +1198,10 @@ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t s
  * and \p HANDLE on native Windows platforms.
  *
  * \note As a special case on Linux, if a tid (thread ID) is supplied
- * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
  * the binding is applied to that specific thread.
  *
- * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
  */
 HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
 
@@ -1072,7 +1224,7 @@ HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t
  * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
  * and \p HANDLE on native Windows platforms.
  *
- * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
  */
 HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
 #endif
@@ -1083,7 +1235,7 @@ HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thr
  * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
  * and \p HANDLE on native Windows platforms.
  *
- * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ * \note ::HWLOC_CPUBIND_PROCESS can not be used in \p flags.
  */
 HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
 #endif
@@ -1095,7 +1247,7 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr
  * so this function may return something that is already
  * outdated.
  *
- * \p flags can include either HWLOC_CPUBIND_PROCESS or HWLOC_CPUBIND_THREAD to
+ * \p flags can include either ::HWLOC_CPUBIND_PROCESS or ::HWLOC_CPUBIND_THREAD to
  * specify whether the query should be for the whole process (union of all CPUs
  * on which all threads are running), or only the current thread. If the
  * process is single-threaded, flags can be set to zero to let hwloc use
@@ -1114,10 +1266,10 @@ HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_
  * and \p HANDLE on native Windows platforms.
  *
  * \note As a special case on Linux, if a tid (thread ID) is supplied
- * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * instead of a pid (process ID) and ::HWLOC_CPUBIND_THREAD is passed in flags,
  * the last CPU location of that specific thread is returned.
  *
- * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ * \note On non-Linux systems, ::HWLOC_CPUBIND_THREAD can not be used in \p flags.
  */
 HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
 
@@ -1148,7 +1300,7 @@ HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, h
  * (e.g., some systems only allow binding memory on a per-thread
  * basis, whereas other systems only allow binding memory for all
  * threads in a process).
- * \p errno will be set to EXDEV when the requested cpuset can not be enforced
+ * \p errno will be set to EXDEV when the requested set can not be enforced
  * (e.g., some systems only allow binding memory to a single NUMA node).
  *
  * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
@@ -1171,14 +1323,17 @@ HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, h
  *                            HWLOC_MEMBIND_BIND, 0);
  * \endcode
  *
- * Each hwloc memory binding function is available in two forms: one
- * that takes a CPU set argument and another that takes a NUMA memory
- * node set argument (see \ref hwlocality_object_sets and \ref
- * hwlocality_bitmap for a discussion of CPU sets and NUMA memory node
- * sets).  The names of the latter form end with _nodeset.  It is also
- * possible to convert between CPU set and node set using
+ * Each hwloc memory binding function takes a bitmap argument that
+ * is a CPU set by default, or a NUMA memory node set if the flag
+ * ::HWLOC_MEMBIND_BYNODESET is specified.
+ * See \ref hwlocality_object_sets and \ref hwlocality_bitmap for a
+ * discussion of CPU sets and NUMA memory node sets.
+ * It is also possible to convert between CPU set and node set using
  * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
  *
+ * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+ * Binding by nodeset should therefore be preferred whenever possible.
+ *
  * \sa Some example codes are available under doc/examples/ in the source tree.
  *
  * \note On some operating systems, memory binding affects the CPU
@@ -1200,18 +1355,21 @@ HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, h
 typedef enum {
   /** \brief Reset the memory allocation policy to the system default.
    * Depending on the operating system, this may correspond to
-   * HWLOC_MEMBIND_FIRSTTOUCH (Linux),
-   * or HWLOC_MEMBIND_BIND (AIX, HP-UX, OSF, Solaris, Windows).
+   * ::HWLOC_MEMBIND_FIRSTTOUCH (Linux),
+   * or ::HWLOC_MEMBIND_BIND (AIX, HP-UX, Solaris, Windows).
+   * This policy is never returned by get membind functions.
+   * The nodeset argument is ignored.
    * \hideinitializer */
   HWLOC_MEMBIND_DEFAULT =	0,
 
-  /** \brief Allocate memory
-   * but do not immediately bind it to a specific locality. Instead,
-   * each page in the allocation is bound only when it is first
-   * touched. Pages are individually bound to the local NUMA node of
-   * the first thread that touches it. If there is not enough memory
-   * on the node, allocation may be done in the specified cpuset
-   * before allocating on other nodes.
+  /** \brief Allocate each memory page individually on the local NUMA
+   * node of the thread that touches it.
+   *
+   * The given nodeset should usually be hwloc_topology_get_topology_nodeset()
+   * so that the touching thread may run and allocate on any node in the system.
+   *
+   * On AIX, if the nodeset is smaller, pages are allocated locally (if the local
+   * node is in the nodeset) or from a random non-local node (otherwise).
    * \hideinitializer */
   HWLOC_MEMBIND_FIRSTTOUCH =	1,
 
@@ -1228,29 +1386,18 @@ typedef enum {
    * \hideinitializer */
   HWLOC_MEMBIND_INTERLEAVE =	3,
 
-  /** \brief Replicate memory on the given nodes; reads from this
-   * memory will attempt to be serviced from the NUMA node local to
-   * the reading thread. Replicating can be useful when multiple
-   * threads from the specified NUMA nodes will be sharing the same
-   * read-only data.
-   *
-   * This policy can only be used with existing memory allocations
-   * (i.e., the hwloc_set_*membind*() functions); it cannot be used
-   * with functions that allocate new memory (i.e., the hwloc_alloc*()
-   * functions).
-   * \hideinitializer */
-  HWLOC_MEMBIND_REPLICATE =	4,
-
   /** \brief For each page bound with this policy, by next time
    * it is touched (and next time only), it is moved from its current
    * location to the local NUMA node of the thread where the memory
    * reference occurred (if it needs to be moved at all).
    * \hideinitializer */
-  HWLOC_MEMBIND_NEXTTOUCH =	5,
+  HWLOC_MEMBIND_NEXTTOUCH =	4,
 
   /** \brief Returned by get_membind() functions when multiple
    * threads or parts of a memory area have differing memory binding
    * policies.
+   * Also returned when binding is unknown because binding hooks are empty
+   * when the topology is loaded from XML without HWLOC_THISSYSTEM=1, etc.
    * \hideinitializer */
   HWLOC_MEMBIND_MIXED = -1
 } hwloc_membind_policy_t;
@@ -1305,11 +1452,22 @@ typedef enum {
    * may fail with errno set to ENOSYS when used with NOCPUBIND.
    * \hideinitializer
    */
-  HWLOC_MEMBIND_NOCPUBIND =     (1<<4)
+  HWLOC_MEMBIND_NOCPUBIND =     (1<<4),
+
+  /** \brief Consider the bitmap argument as a nodeset.
+   *
+   * The bitmap argument is considered a nodeset if this flag is given,
+   * or a cpuset otherwise by default.
+   *
+   * Memory binding by CPU set cannot work for CPU-less NUMA memory nodes.
+   * Binding by nodeset should therefore be preferred whenever possible.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_BYNODESET =     (1<<5)
 } hwloc_membind_flags_t;
 
 /** \brief Set the default memory binding policy of the current
- * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ * process or thread to prefer the NUMA node(s) specified by \p set
  *
  * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
  * specified, the current process is assumed to be single-threaded.
@@ -1317,30 +1475,18 @@ typedef enum {
  * process-based OS functions or thread-based OS functions, depending
  * on which are available.
  *
- * \return -1 with errno set to ENOSYS if the action is not supported
- * \return -1 with errno set to EXDEV if the binding cannot be enforced
- */
-HWLOC_DECLSPEC int hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
-
-/** \brief Set the default memory binding policy of the current
- * process or thread to prefer the NUMA node(s) near the specified physical \p
- * cpuset
- *
- * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
- * specified, the current process is assumed to be single-threaded.
- * This is the most portable form as it permits hwloc to use either
- * process-based OS functions or thread-based OS functions, depending
- * on which are available.
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset.
+ * Otherwise it's a cpuset.
  *
  * \return -1 with errno set to ENOSYS if the action is not supported
  * \return -1 with errno set to EXDEV if the binding cannot be enforced
  */
-HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
 
 /** \brief Query the default memory binding policy and physical locality of the
  * current process or thread.
  *
- * This function has two output parameters: \p nodeset and \p policy.
+ * This function has two output parameters: \p set and \p policy.
  * The values returned in these parameters depend on both the \p flags
  * passed in and the current memory binding policies and nodesets in
  * the queried target.
@@ -1360,85 +1506,34 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpus
  * is also specified.  In this case, hwloc will check the default
  * memory policies and nodesets for all threads in the process.  If
  * they are not identical, -1 is returned and errno is set to EXDEV.
- * If they are identical, the values are returned in \p nodeset and \p
+ * If they are identical, the values are returned in \p set and \p
  * policy.
  *
  * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
- * ::HWLOC_MEMBIND_STRICT is \em not specified), \p nodeset is set to
- * the logical OR of all threads' default nodeset.  If all threads'
- * default policies are the same, \p policy is set to that policy.  If
- * they are different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
- *
- * In the ::HWLOC_MEMBIND_THREAD case (or when neither
- * ::HWLOC_MEMBIND_PROCESS or ::HWLOC_MEMBIND_THREAD is specified), there
- * is only one nodeset and policy; they are returned in \p nodeset and
- * \p policy, respectively.
- *
- * If any other flags are specified, -1 is returned and errno is set
- * to EINVAL.
- */
-HWLOC_DECLSPEC int hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
-
-/** \brief Query the default memory binding policy and physical locality of the
- * current process or thread (the locality is returned in \p cpuset as
- * CPUs near the locality's actual NUMA node(s)).
- *
- * This function has two output parameters: \p cpuset and \p policy.
- * The values returned in these parameters depend on both the \p flags
- * passed in and the current memory binding policies and nodesets in
- * the queried target.
- *
- * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
- * target is the current policies and nodesets for all the threads in
- * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
- * the query target is the current policy and nodeset for only the
- * thread invoking this function.
- *
- * If neither of these flags are passed (which is the most portable
- * method), the process is assumed to be single threaded.  This allows
- * hwloc to use either process-based OS functions or thread-based OS
- * functions, depending on which are available.
- *
- * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
- * is also specified.  In this case, hwloc will check the default
- * memory policies and nodesets for all threads in the process.  If
- * they are not identical, -1 is returned and errno is set to EXDEV.
- * If they are identical, the policy is returned in \p policy.  \p
- * cpuset is set to the union of CPUs near the NUMA node(s) in the
- * nodeset.
- *
- * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
- * ::HWLOC_MEMBIND_STRICT is \em not specified), the default nodeset
- * from each thread is logically OR'ed together.  \p cpuset is set to
- * the union of CPUs near the NUMA node(s) in the resulting nodeset.
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default set
+ * from each thread is logically OR'ed together.
  * If all threads' default policies are the same, \p policy is set to
  * that policy.  If they are different, \p policy is set to
  * ::HWLOC_MEMBIND_MIXED.
  *
  * In the ::HWLOC_MEMBIND_THREAD case (or when neither
  * ::HWLOC_MEMBIND_PROCESS or ::HWLOC_MEMBIND_THREAD is specified), there
- * is only one nodeset and policy.  The policy is returned in \p
- * policy; \p cpuset is set to the union of CPUs near the NUMA node(s)
- * in the \p nodeset.
+ * is only one set and policy; they are returned in \p set and
+ * \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
  *
  * If any other flags are specified, -1 is returned and errno is set
  * to EINVAL.
  */
-HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
 
 /** \brief Set the default memory binding policy of the specified
- * process to prefer the NUMA node(s) specified by physical \p nodeset
- *
- * \return -1 with errno set to ENOSYS if the action is not supported
- * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ * process to prefer the NUMA node(s) specified by \p set
  *
- * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
- * and \p HANDLE on native Windows platforms.
- */
-HWLOC_DECLSPEC int hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
-
-/** \brief Set the default memory binding policy of the specified
- * process to prefer the NUMA node(s) near the specified physical \p cpuset
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
  *
  * \return -1 with errno set to ENOSYS if the action is not supported
  * \return -1 with errno set to EXDEV if the binding cannot be enforced
@@ -1446,12 +1541,12 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwl
  * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
  * and \p HANDLE on native Windows platforms.
  */
-HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
 
 /** \brief Query the default memory binding policy and physical locality of the
  * specified process.
  *
- * This function has two output parameters: \p nodeset and \p policy.
+ * This function has two output parameters: \p set and \p policy.
  * The values returned in these parameters depend on both the \p flags
  * passed in and the current memory binding policies and nodesets in
  * the queried target.
@@ -1471,82 +1566,40 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t
  * memory policies and nodesets for all threads in the specified
  * process.  If they are not identical, -1 is returned and errno is
  * set to EXDEV.  If they are identical, the values are returned in \p
- * nodeset and \p policy.
- *
- * Otherwise, \p nodeset is set to the logical OR of all threads'
- * default nodeset.  If all threads' default policies are the same, \p
- * policy is set to that policy.  If they are different, \p policy is
- * set to ::HWLOC_MEMBIND_MIXED.
- *
- * If any other flags are specified, -1 is returned and errno is set
- * to EINVAL.
- *
- * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
- * and \p HANDLE on native Windows platforms.
- */
-HWLOC_DECLSPEC int hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
-
-/** \brief Query the default memory binding policy and physical locality of the
- * specified process (the locality is returned in \p cpuset as CPUs
- * near the locality's actual NUMA node(s)).
- *
- * This function has two output parameters: \p cpuset and \p policy.
- * The values returned in these parameters depend on both the \p flags
- * passed in and the current memory binding policies and nodesets in
- * the queried target.
+ * set and \p policy.
  *
- * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
- * target is the current policies and nodesets for all the threads in
- * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
- * (which is the most portable method), the process is assumed to be
- * single threaded.  This allows hwloc to use either process-based OS
- * functions or thread-based OS functions, depending on which are
- * available.
- *
- * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
- * this function.
- *
- * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
- * memory policies and nodesets for all threads in the specified
- * process.  If they are not identical, -1 is returned and errno is
- * set to EXDEV.  If they are identical, the policy is returned in \p
- * policy.  \p cpuset is set to the union of CPUs near the NUMA
- * node(s) in the nodeset.
- *
- * Otherwise, the default nodeset from each thread is logically OR'ed
- * together.  \p cpuset is set to the union of CPUs near the NUMA
- * node(s) in the resulting nodeset.  If all threads' default policies
+ * Otherwise, \p set is set to the logical OR of all threads'
+ * default set.  If all threads' default policies
  * are the same, \p policy is set to that policy.  If they are
  * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
  *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
  * If any other flags are specified, -1 is returned and errno is set
  * to EINVAL.
  *
  * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
  * and \p HANDLE on native Windows platforms.
  */
-HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
 
 /** \brief Bind the already-allocated memory identified by (addr, len)
- * to the NUMA node(s) in physical \p nodeset.
+ * to the NUMA node(s) specified by \p set.
  *
- * \return -1 with errno set to ENOSYS if the action is not supported
- * \return -1 with errno set to EXDEV if the binding cannot be enforced
- */
-HWLOC_DECLSPEC int hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
-
-/** \brief Bind the already-allocated memory identified by (addr, len)
- * to the NUMA node(s) near physical \p cpuset.
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
  *
+ * \return 0 if \p len is 0.
  * \return -1 with errno set to ENOSYS if the action is not supported
  * \return -1 with errno set to EXDEV if the binding cannot be enforced
  */
-HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags);
 
-/** \brief Query the physical NUMA node(s) and binding policy of the memory
- * identified by (\p addr, \p len ).
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
  *
- * This function has two output parameters: \p nodeset and \p policy.
+ * This function has two output parameters: \p set and \p policy.
  * The values returned in these parameters depend on both the \p flags
  * passed in and the memory binding policies and nodesets of the pages
  * in the address range.
@@ -1554,44 +1607,44 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void
  * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
  * checked to see if they all have the same memory binding policy and
  * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
- * If they are identical across all pages, the nodeset and policy are
- * returned in \p nodeset and \p policy, respectively.
+ * If they are identical across all pages, the set and policy are
+ * returned in \p set and \p policy, respectively.
  *
- * If ::HWLOC_MEMBIND_STRICT is not specified, \p nodeset is set to the
- * union of all NUMA node(s) containing pages in the address range.
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.
  * If all pages in the target have the same policy, it is returned in
  * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
  *
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
+ *
  * If any other flags are specified, -1 is returned and errno is set
  * to EINVAL.
+ *
+ * If \p len is 0, -1 is returned and errno is set to EINVAL.
  */
-HWLOC_DECLSPEC int hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags);
 
-/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
- * the memory identified by (\p addr, \p len ).
+/** \brief Get the NUMA nodes where memory identified by (\p addr, \p len ) is physically allocated.
  *
- * This function has two output parameters: \p cpuset and \p policy.
- * The values returned in these parameters depend on both the \p flags
- * passed in and the memory binding policies and nodesets of the pages
- * in the address range.
+ * Fills \p set according to the NUMA nodes where the memory area pages
+ * are physically allocated. If no page is actually allocated yet,
+ * \p set may be empty.
  *
- * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
- * checked to see if they all have the same memory binding policy and
- * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
- * If they are identical across all pages, the policy is returned in
- * \p policy.  \p cpuset is set to the union of CPUs near the NUMA
- * node(s) in the nodeset.
+ * If pages spread to multiple nodes, it is not specified whether they spread
+ * equitably, or whether most of them are on a single node, etc.
  *
- * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
- * node(s) containing pages in the address range is calculated.  \p
- * cpuset is then set to the CPUs near the NUMA node(s) in this union.
- * If all pages in the target have the same policy, it is returned in
- * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ * The operating system may move memory pages from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
  *
- * If any other flags are specified, -1 is returned and errno is set
- * to EINVAL.
+ * If ::HWLOC_MEMBIND_BYNODESET is specified in \p flags, \p set is
+ * considered a nodeset. Otherwise it's a cpuset.
+ *
+ * If \p len is 0, \p set is emptied.
  */
-HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, int flags);
 
 /** \brief Allocate some memory
  *
@@ -1602,7 +1655,7 @@ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void
  */
 HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
 
-/** \brief Allocate some memory on the given physical nodeset \p nodeset
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
  *
  * \return NULL with errno set to ENOSYS if the action is not supported
  * and ::HWLOC_MEMBIND_STRICT is given
@@ -1611,38 +1664,24 @@ HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
  * \return NULL with errno set to ENOMEM if the memory allocation failed
  * even before trying to bind.
  *
- * \note The allocated memory should be freed with hwloc_free().
- */
-HWLOC_DECLSPEC void *hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
-
-/** \brief Allocate some memory on memory nodes near the given physical cpuset \p cpuset
- *
- * \return NULL with errno set to ENOSYS if the action is not supported
- * and ::HWLOC_MEMBIND_STRICT is given
- * \return NULL with errno set to EXDEV if the binding cannot be enforced
- * and ::HWLOC_MEMBIND_STRICT is given
- * \return NULL with errno set to ENOMEM if the memory allocation failed
- * even before trying to bind.
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
  *
  * \note The allocated memory should be freed with hwloc_free().
  */
-HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
 
-/** \brief Allocate some memory on the given nodeset \p nodeset
+/** \brief Allocate some memory on NUMA memory nodes specified by \p set
  *
- * This is similar to hwloc_alloc_membind except that it is allowed to change
+ * This is similar to hwloc_alloc_membind() except that it is allowed to change
  * the current memory binding policy, thus providing more binding support, at
  * the expense of changing the current state.
- */
-static __hwloc_inline void *
-hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
-
-/** \brief Allocate some memory on the memory nodes near given cpuset \p cpuset
  *
- * This is similar to hwloc_alloc_membind_policy_nodeset, but for a given cpuset.
+ * If ::HWLOC_MEMBIND_BYNODESET is specified, \p set is considered a nodeset.
+ * Otherwise it's a cpuset.
  */
 static __hwloc_inline void *
-hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
 
 /** \brief Free memory that was previously allocated by hwloc_alloc()
  * or hwloc_alloc_membind().
@@ -1664,8 +1703,6 @@ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
  * file as if hwloc_topology_set_xml() had been called.
  * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
  * hwloc_topology_set_synthetic() had been called.
- * Setting HWLOC_FSROOT switches to reading the topology from the specified Linux
- * filesystem root.
  *
  * Finally, HWLOC_THISSYSTEM enforces the return value of
  * hwloc_topology_is_thissystem().
@@ -1673,7 +1710,7 @@ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
  * @{
  */
 
-/** \brief Change which pid the topology is viewed from
+/** \brief Change which process the topology is viewed from.
  *
  * On some systems, processes may have different views of the machine, for
  * instance the set of allowed CPUs. By default, hwloc exposes the view from
@@ -1692,12 +1729,10 @@ HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topo
 /** \brief Enable synthetic topology.
  *
  * Gather topology information from the given \p description,
- * a space-separated string of numbers describing
- * the arity of each level.
- * Each number may be prefixed with a type and a colon to enforce the type
- * of a level.  If only some level types are enforced, hwloc will try to
- * choose the other types according to usual topologies, but it may fail
- * and you may have to specify more level types manually.
+ * a space-separated string of <type:number> describing
+ * the object type and arity at each level.
+ * All types may be omitted (space-separated string of numbers) so that
+ * hwloc chooses all types according to usual topologies.
  * See also the \ref synthetic.
  *
  * Setting the environment variable HWLOC_SYNTHETIC
@@ -1740,7 +1775,7 @@ HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restric
  *
  * \note For convenience, this backend provides empty binding hooks which just
  * return success.  To have hwloc still actually call OS-specific hooks, the
- * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
  * file is really the underlying system.
  *
  * \note On success, the XML component replaces the previously enabled
@@ -1768,7 +1803,7 @@ HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topo
  *
  * \note For convenience, this backend provides empty binding hooks which just
  * return success.  To have hwloc still actually call OS-specific hooks, the
- * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
  * file is really the underlying system.
  *
  * \note On success, the XML component replaces the previously enabled
@@ -1777,6 +1812,31 @@ HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topo
  */
 HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
 
+/** \brief Flags to be passed to hwloc_topology_set_components()
+ */
+enum hwloc_topology_components_flag_e {
+  /** \brief Blacklist the target component from being used.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST = (1UL<<0)
+};
+
+/** \brief Prevent a discovery component from being used for a topology.
+ *
+ * \p name is the name of the discovery component that should not be used
+ * when loading topology \p topology. The name is a string such as "cuda".
+ *
+ * For components with multiple phases, it may also be suffixed with the name
+ * of a phase, for instance "linux:io".
+ *
+ * \p flags should be ::HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST.
+ *
+ * This may be used to avoid expensive parts of the discovery process.
+ * For instance, CUDA-specific discovery may be expensive and unneeded
+ * while generic I/O discovery could still be useful.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_components(hwloc_topology_t __hwloc_restrict topology, unsigned long flags, const char * __hwloc_restrict name);
+
 /** @} */
 
 
@@ -1796,22 +1856,32 @@ HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restric
  * They may also be returned by hwloc_topology_get_flags().
  */
 enum hwloc_topology_flags_e {
- /** \brief Detect the whole system, ignore reservations.
+ /** \brief Detect the whole system, ignore reservations, include disallowed objects.
    *
    * Gather all resources, even if some were disabled by the administrator.
    * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
    *
-   * When this flag is set, each object has allowed_cpuset <= cpuset <= complete_cpuset.
-   * Otherwise allowed_cpuset = cpuset <= complete_cpuset.
-   * The same applies to nodesets.
+   * When this flag is not set, PUs and NUMA nodes that are disallowed are not added to the topology.
+   * Parent objects (package, core, cache, etc.) are added only if some of their children are allowed.
+   * All existing PUs and NUMA nodes in the topology are allowed.
+   * hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset()
+   * are equal to the root object cpuset and nodeset.
+   *
+   * When this flag is set, the actual sets of allowed PUs and NUMA nodes are given
+   * by hwloc_topology_get_allowed_cpuset() and hwloc_topology_get_allowed_nodeset().
+   * They may be smaller than the root object cpuset and nodeset.
+   *
+   * If the current topology is exported to XML and reimported later, this flag
+   * should be set again in the reimported topology so that disallowed resources
+   * are reimported as well.
    * \hideinitializer
    */
-  HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM = (1UL<<0),
+  HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED = (1UL<<0),
 
  /** \brief Assume that the selected backend provides the topology for the
    * system on which we are running.
    *
-   * This forces hwloc_topology_is_thissystem to return 1, i.e. makes hwloc assume that
+   * This forces hwloc_topology_is_thissystem() to return 1, i.e. makes hwloc assume that
    * the selected backend provides the topology for the system on which we are running,
    * even if it is not the OS-specific backend but the XML backend for instance.
    * This means making the binding functions actually call the OS-specific
@@ -1828,45 +1898,26 @@ enum hwloc_topology_flags_e {
    */
   HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
 
-  /** \brief Detect PCI devices.
+ /** \brief Get the set of allowed resources from the local operating system even if the topology was loaded from XML or synthetic description.
    *
-   * By default, I/O devices are ignored. This flag enables I/O device
-   * detection using the pci backend. Only the common PCI devices (GPUs,
-   * NICs, block devices, ...) and host bridges (objects that connect the host
-   * objects to an I/O subsystem) will be added to the topology.
-   * Additionally it also enables MemoryDevice misc objects.
-   * Uncommon devices and other bridges (such as PCI-to-PCI bridges) will be
-   * ignored.
-   * \hideinitializer
-   */
-  HWLOC_TOPOLOGY_FLAG_IO_DEVICES = (1UL<<2),
-
-  /** \brief Detect PCI bridges.
+   * If the topology was loaded from XML or from a synthetic string,
+   * restrict it by applying the current process restrictions such as
+   * Linux Cgroup/Cpuset.
    *
-   * This flag should be combined with HWLOC_TOPOLOGY_FLAG_IO_DEVICES to enable
-   * the detection of both common devices and of all useful bridges (bridges that
-   * have at least one device behind them).
-   * \hideinitializer
-   */
-  HWLOC_TOPOLOGY_FLAG_IO_BRIDGES = (1UL<<3),
-
-  /** \brief Detect the whole PCI hierarchy.
+   * This is useful when the topology is not loaded directly from
+   * the local machine (e.g. for performance reasons) and it comes
+   * with all resources, while the running process is restricted
+   * to only parts of the machine.
    *
-   * This flag enables detection of all I/O devices (even the uncommon ones)
-   * and bridges (even those that have no device behind them) using the pci
-   * backend.
-   * This implies HWLOC_TOPOLOGY_FLAG_IO_DEVICES.
-   * \hideinitializer
-   */
-  HWLOC_TOPOLOGY_FLAG_WHOLE_IO = (1UL<<4),
-
-  /** \brief Detect instruction caches.
+   * This flag is ignored unless ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM is
+   * also set since the loaded topology must match the underlying machine
+   * where restrictions will be gathered from.
    *
-   * This flag enables detection of Instruction caches,
-   * instead of only Data and Unified caches.
+   * Setting the environment variable HWLOC_THISSYSTEM_ALLOWED_RESOURCES
+   * would result in the same behavior.
    * \hideinitializer
    */
-  HWLOC_TOPOLOGY_FLAG_ICACHES = (1UL<<5)
+  HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES = (1UL<<2)
 };
 
 /** \brief Set OR'ed flags to non-yet-loaded topology.
@@ -1901,9 +1952,21 @@ HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t  __hwloc_restri
 struct hwloc_topology_discovery_support {
   /** \brief Detecting the number of PU objects is supported. */
   unsigned char pu;
+  /** \brief Detecting the number of NUMA nodes is supported. */
+  unsigned char numa;
+  /** \brief Detecting the amount of memory in NUMA nodes is supported. */
+  unsigned char numa_memory;
+  /** \brief Detecting and identifying PU objects that are not available to the current process is supported. */
+  unsigned char disallowed_pu;
+  /** \brief Detecting and identifying NUMA nodes that are not available to the current process is supported. */
+  unsigned char disallowed_numa;
 };
 
-/** \brief Flags describing actual PU binding support for this topology. */
+/** \brief Flags describing actual PU binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
 struct hwloc_topology_cpubind_support {
   /** Binding the whole current process is supported.  */
   unsigned char set_thisproc_cpubind;
@@ -1929,7 +1992,11 @@ struct hwloc_topology_cpubind_support {
   unsigned char get_thisthread_last_cpu_location;
 };
 
-/** \brief Flags describing actual memory binding support for this topology. */
+/** \brief Flags describing actual memory binding support for this topology.
+ *
+ * A flag may be set even if the feature isn't supported in all cases
+ * (e.g. binding to random sets of non-contiguous objects).
+ */
 struct hwloc_topology_membind_support {
   /** Binding the whole current process is supported.  */
   unsigned char set_thisproc_membind;
@@ -1955,13 +2022,12 @@ struct hwloc_topology_membind_support {
   unsigned char bind_membind;
   /** Interleave policy is supported. */
   unsigned char interleave_membind;
-  /** Replication policy is supported. */
-  unsigned char replicate_membind;
   /** Next-touch migration policy is supported. */
   unsigned char nexttouch_membind;
-
   /** Migration flags is supported. */
   unsigned char migrate_membind;
+  /** Getting the last NUMA nodes where a memory area was allocated is supported. */
+  unsigned char get_area_memlocation;
 };
 
 /** \brief Set of flags describing actual support for this topology.
@@ -1976,63 +2042,101 @@ struct hwloc_topology_support {
   struct hwloc_topology_membind_support *membind;
 };
 
-/** \brief Retrieve the topology support. */
+/** \brief Retrieve the topology support.
+ *
+ * Each flag indicates whether a feature is supported.
+ * If set to 0, the feature is not supported.
+ * If set to 1, the feature is supported, but the corresponding
+ * call may still fail in some corner cases.
+ *
+ * These features are also listed by hwloc-info \--support
+ */
 HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
 
-/** \brief Ignore an object type.
+/** \brief Type filtering flags.
  *
- * Ignore all objects from the given type.
- * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
- * The top-level object of the hierarchy will never be ignored, even if this function
- * succeeds.
- * I/O objects may not be ignored, topology flags should be used to configure
- * their discovery instead.
+ * By default, most objects are kept (::HWLOC_TYPE_FILTER_KEEP_ALL).
+ * Instruction caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ * Die and Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+ *
+ * Note that group objects are also ignored individually (without the entire level)
+ * when they do not bring structure.
  */
-HWLOC_DECLSPEC int hwloc_topology_ignore_type(hwloc_topology_t topology, hwloc_obj_type_t type);
+enum hwloc_type_filter_e {
+  /** \brief Keep all objects of this type.
+   *
+   * Cannot be set for ::HWLOC_OBJ_GROUP (groups are designed only to add more structure to the topology).
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_ALL = 0,
 
-/** \brief Ignore an object type if it does not bring any structure.
- *
- * Ignore all objects from the given type as long as they do not bring any structure:
- * Each ignored object should have a single children or be the only child of its parent.
- * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
- * I/O objects may not be ignored, topology flags should be used to configure
- * their discovery instead.
- * Group objects are always ignored if they do not bring any structure
- * since they are designed to add structure to the topology.
- * Misc objects cannot be ignored based on the structure since they are only annotations
- * outside of the main topology structure.
+  /** \brief Ignore all objects of this type.
+   *
+   * The bottom-level type ::HWLOC_OBJ_PU, the ::HWLOC_OBJ_NUMANODE type, and
+   * the top-level type ::HWLOC_OBJ_MACHINE may not be ignored.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_NONE = 1,
+
+  /** \brief Only ignore objects if their entire level does not bring any structure.
+   *
+   * Keep the entire level of objects if at least one of these objects adds
+   * structure to the topology. An object brings structure when it has multiple
+   * children and it is not the only child of its parent.
+   *
+   * If all objects in the level are the only child of their parent, and if none
+   * of them has multiple children, the entire level is removed.
+   *
+   * Cannot be set for I/O and Misc objects since the topology structure does not matter there.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_STRUCTURE = 2,
+
+  /** \brief Only keep likely-important objects of the given type.
+   *
+   * It is only useful for I/O object types.
+   * For ::HWLOC_OBJ_PCI_DEVICE and ::HWLOC_OBJ_OS_DEVICE, it means that only objects
+   * of major/common kinds are kept (storage, network, OpenFabrics, Intel MICs, CUDA,
+   * OpenCL, NVML, and displays).
+   * Also, only OS devices directly attached on PCI (e.g. no USB) are reported.
+   * For ::HWLOC_OBJ_BRIDGE, it means that bridges are kept only if they have children.
+   *
+   * This flag is equivalent to ::HWLOC_TYPE_FILTER_KEEP_ALL for Normal, Memory and Misc types
+   * since they are likely important.
+   * \hideinitializer
+   */
+  HWLOC_TYPE_FILTER_KEEP_IMPORTANT = 3
+};
+
+/** \brief Set the filtering for the given object type.
  */
-HWLOC_DECLSPEC int hwloc_topology_ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_type_t type);
+HWLOC_DECLSPEC int hwloc_topology_set_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter);
 
-/** \brief Ignore all objects that do not bring any structure.
- *
- * Ignore all objects that do not bring any structure:
- * Each ignored object should have a single children or be the only child of its parent.
- * I/O objects may not be ignored, topology flags should be used to configure
- * their discovery instead.
+/** \brief Get the current filtering for the given object type.
  */
-HWLOC_DECLSPEC int hwloc_topology_ignore_all_keep_structure(hwloc_topology_t topology);
+HWLOC_DECLSPEC int hwloc_topology_get_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filter);
 
-/** \brief Provide a distance matrix.
+/** \brief Set the filtering for all object types.
  *
- * Provide the matrix of distances between a set of objects of the given type.
- * The set may or may not contain all the existing objects of this type.
- * The objects are specified by their OS/physical index in the \p os_index
- * array. The \p distances matrix follows the same order.
- * The distance from object i to object j in the i*nbobjs+j.
+ * If some types do not support this filtering, they are silently ignored.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_all_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU cache object types.
  *
- * A single latency matrix may be defined for each type.
- * If another distance matrix already exists for the given type,
- * either because the user specified it or because the OS offers it,
- * it will be replaced by the given one.
- * If \p nbobjs is \c 0, \p os_index is \c NULL and \p distances is \c NULL,
- * the existing distance matrix for the given type is removed.
+ * Memory-side caches are not involved since they are not CPU caches.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all CPU instruction cache object types.
  *
- * \note Distance matrices are ignored in multi-node topologies.
+ * Memory-side caches are not involved since they are not CPU caches.
  */
-HWLOC_DECLSPEC int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology,
-						      hwloc_obj_type_t type, unsigned nbobjs,
-						      unsigned *os_index, float *distances);
+HWLOC_DECLSPEC int hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
+
+/** \brief Set the filtering for all I/O object types.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
 
 /** \brief Set the topology-specific userdata pointer.
  *
@@ -2063,47 +2167,102 @@ HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
 
 /** \brief Flags to be given to hwloc_topology_restrict(). */
 enum hwloc_restrict_flags_e {
-  /** \brief Adapt distance matrices according to objects being removed during restriction.
-   * If this flag is not set, distance matrices are removed.
+  /** \brief Remove all objects that became CPU-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_REMOVE_CPULESS = (1UL<<0),
+
+  /** \brief Restrict by nodeset instead of CPU set.
+   * Only keep objects whose nodeset is included or partially included in the given set.
+   * This flag may not be used with ::HWLOC_RESTRICT_FLAG_REMOVE_CPULESS.
+   */
+  HWLOC_RESTRICT_FLAG_BYNODESET =  (1UL<<3),
+
+  /** \brief Remove all objects that became Memory-less.
+   * By default, only objects that contain no PU and no memory are removed.
+   * This flag may only be used with ::HWLOC_RESTRICT_FLAG_BYNODESET.
    * \hideinitializer
    */
-  HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES = (1<<0),
+  HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS = (1UL<<4),
 
   /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
    * If this flag is not set, Misc objects are removed when their parents are removed.
    * \hideinitializer
    */
-  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1<<1),
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1UL<<1),
 
   /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
    * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
    * \hideinitializer
    */
-  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1<<2)
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1UL<<2)
 };
 
-/** \brief Restrict the topology to the given CPU set.
+/** \brief Restrict the topology to the given CPU set or nodeset.
  *
  * Topology \p topology is modified so as to remove all objects that
- * are not included (or partially included) in the CPU set \p cpuset.
+ * are not included (or partially included) in the CPU set \p set.
  * All objects CPU and node sets are restricted accordingly.
  *
+ * If ::HWLOC_RESTRICT_FLAG_BYNODESET is passed in \p flags,
+ * \p set is considered a nodeset instead of a CPU set.
+ *
  * \p flags is a OR'ed set of ::hwloc_restrict_flags_e.
  *
  * \note This call may not be reverted by restricting back to a larger
- * cpuset. Once dropped during restriction, objects may not be brought
+ * set. Once dropped during restriction, objects may not be brought
  * back, except by loading another topology with hwloc_topology_load().
  *
  * \return 0 on success.
  *
- * \return -1 with errno set to EINVAL if the input cpuset is invalid.
+ * \return -1 with errno set to EINVAL if the input set is invalid.
  * The topology is not modified in this case.
  *
  * \return -1 with errno set to ENOMEM on failure to allocate internal data.
  * The topology is reinitialized in this case. It should be either
  * destroyed with hwloc_topology_destroy() or configured and loaded again.
  */
-HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, unsigned long flags);
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_bitmap_t set, unsigned long flags);
+
+/** \brief Flags to be given to hwloc_topology_allow(). */
+enum hwloc_allow_flags_e {
+  /** \brief Mark all objects as allowed in the topology.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_ALL = (1UL<<0),
+
+  /** \brief Only allow objects that are available to the current process.
+   *
+   * The topology must have ::HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM so that the set
+   * of available resources can actually be retrieved from the operating system.
+   *
+   * \p cpuset and \p nodeset given to hwloc_topology_allow() must be \c NULL.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS = (1UL<<1),
+
+  /** \brief Allow a custom set of objects, given to hwloc_topology_allow() as \p cpuset and/or \p nodeset parameters.
+   * \hideinitializer */
+  HWLOC_ALLOW_FLAG_CUSTOM = (1UL<<2)
+};
+
+/** \brief Change the sets of allowed PUs and NUMA nodes in the topology.
+ *
+ * This function only works if the ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+ * was set on the topology. It does not modify any object, it only changes
+ * the sets returned by hwloc_topology_get_allowed_cpuset() and
+ * hwloc_topology_get_allowed_nodeset().
+ *
+ * It is notably useful when importing a topology from another process
+ * running in a different Linux Cgroup.
+ *
+ * \p flags must be set to one flag among ::hwloc_allow_flags_e.
+ *
+ * \note Removing objects from a topology should rather be performed with
+ * hwloc_topology_restrict().
+ */
+HWLOC_DECLSPEC int hwloc_topology_allow(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, hwloc_const_nodeset_t nodeset, unsigned long flags);
 
 /** \brief Add a MISC object as a leaf of the topology
  *
@@ -2112,11 +2271,17 @@ HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict top
  * without ever adding any intermediate hierarchy level. This is useful for
  * annotating the topology without actually changing the hierarchy.
  *
- * \p name will be copied to the setup the new object attributes.
- * However, the new leaf object will not have any \p cpuset.
+ * \p name is supposed to be unique across all Misc objects in the topology.
+ * It will be duplicated to setup the new object attributes.
+ *
+ * The new leaf object will not have any \p cpuset.
  *
  * \return the newly-created object
  *
+ * \return \c NULL on error.
+ *
+ * \return \c NULL if Misc objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ *
  * \note If \p name contains some non-printable characters, they will
  * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
  */
@@ -2128,9 +2293,13 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t to
  * The caller should (at least) initialize its sets before inserting the object.
  * See hwloc_topology_insert_group_object().
  *
- * Custom name/value info pairs may be added with hwloc_obj_add_info() after
- * insertion. For instance the Type info key allows to display something else
+ * The \p subtype object attribute may be set to display something else
  * than "Group" as the type name for this object in lstopo.
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion.
+ *
+ * The \p kind group attribute should be 0. The \p subkind group attribute may
+ * be set to identify multiple Groups of the same level.
  *
  * It is recommended not to set any other object attribute before insertion,
  * since the Group may get discarded during insertion.
@@ -2143,15 +2312,25 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t to
 /** \brief Add more structure to the topology by adding an intermediate Group
  *
  * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
- * Then it must initialize some of its sets to specify the final location of the Group
- * in the topology.
+ * Then it must setup at least one of its CPU or node sets to specify
+ * the final location of the Group in the topology.
  * Then the object can be passed to this function for actual insertion in the topology.
  *
- * Either the cpuset or nodeset field (or both, if compatible) may be used to do so.
- * If inserting with respect to the complete topology (including disallowed, offline
- * or unknown object), complete_cpuset and/or complete_nodeset may be used instead.
- * It grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way to
- * build the Group sets iteratively.
+ * The group \p dont_merge attribute may be set to prevent the core from
+ * ever merging this object with another object hierarchically-identical.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) must be set
+ * to a non-empty bitmap. The complete_cpuset or complete_nodeset may be set
+ * instead if inserting with respect to the complete topology
+ * (including disallowed, offline or unknown objects).
+ *
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way
+ * to build the Group sets iteratively.
+ *
+ * These sets cannot be larger than the current topology, or they would get
+ * restricted silently.
+ *
+ * The core will setup the other sets after actual insertion.
  *
  * \return The inserted object if it was properly inserted.
  *
@@ -2161,7 +2340,7 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t to
  *
  * \return \c NULL if the insertion failed because of conflicting sets in topology tree.
  *
- * \return \c NULL if Group objects are always ignored in the topology.
+ * \return \c NULL if Group objects are filtered-out of the topology (::HWLOC_TYPE_FILTER_KEEP_NONE).
  *
  * \return \c NULL if the object was discarded because no set was initialized in the Group
  * before insert, or all of them were empty.
@@ -2189,18 +2368,21 @@ HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src
 
 
 /* high-level helpers */
-#include <hwloc/helper.h>
+#include "hwloc/helper.h"
 
 /* inline code of some functions above */
-#include <hwloc/inlines.h>
+#include "hwloc/inlines.h"
 
 /* exporting to XML or synthetic */
-#include <hwloc/export.h>
+#include "hwloc/export.h"
+
+/* distances */
+#include "hwloc/distances.h"
 
 /* topology diffs */
-#include <hwloc/diff.h>
+#include "hwloc/diff.h"
 
 /* deprecated headers */
-#include <hwloc/deprecated.h>
+#include "hwloc/deprecated.h"
 
 #endif /* HWLOC_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h b/ext/hwloc/include/hwloc/autogen/config.h
index 3c243ed14..9e8eb1410 100644
--- a/ext/hwloc/include/hwloc/autogen/config.h
+++ b/ext/hwloc/include/hwloc/autogen/config.h
@@ -1,7 +1,7 @@
 /* include/hwloc/autogen/config.h.  Generated from config.h.in by configure.  */
 /* -*- c -*-
  * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -68,6 +68,13 @@
 #define GCC_ABOVE_3_3 0
 #endif
 
+#if !defined(__cplusplus) &&					\
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GCC_ABOVE_3_4 1
+#else
+#define GCC_ABOVE_3_4 0
+#endif
+
 /* Maybe before gcc 2.95 too */
 #ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
 #define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
@@ -121,6 +128,7 @@
 # define __hwloc_attribute_pure
 #endif
 
+#ifndef __hwloc_attribute_deprecated /* allow the user to disable these warnings by defining this macro to nothing */
 #ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
 #define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
 #elif defined(__GNUC__)
@@ -133,6 +141,7 @@
 #else
 # define __hwloc_attribute_deprecated
 #endif
+#endif
 
 #ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
 #define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
@@ -147,6 +156,19 @@
 # define __hwloc_attribute_may_alias
 #endif
 
+#ifdef HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT (GXX_ABOVE_3_4 || GCC_ABOVE_3_4)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT
+# define __hwloc_attribute_warn_unused_result __attribute__((__warn_unused_result__))
+#else
+# define __hwloc_attribute_warn_unused_result
+#endif
+
 #ifdef HWLOC_C_HAVE_VISIBILITY
 # if HWLOC_C_HAVE_VISIBILITY
 #  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
diff --git a/ext/hwloc/include/hwloc/bitmap.h b/ext/hwloc/include/hwloc/bitmap.h
index bb18f6504..d5b0ea020 100644
--- a/ext/hwloc/include/hwloc/bitmap.h
+++ b/ext/hwloc/include/hwloc/bitmap.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -13,7 +13,8 @@
 #ifndef HWLOC_BITMAP_H
 #define HWLOC_BITMAP_H
 
-#include <hwloc/autogen/config.h>
+#include "hwloc/autogen/config.h"
+
 #include <assert.h>
 
 
@@ -24,21 +25,38 @@ extern "C" {
 
 /** \defgroup hwlocality_bitmap The bitmap API
  *
- * The ::hwloc_bitmap_t type represents a set of objects, typically OS
- * processors -- which may actually be hardware threads (represented
- * by ::hwloc_cpuset_t, which is a typedef for ::hwloc_bitmap_t) -- or
- * memory nodes (represented by ::hwloc_nodeset_t, which is also a
- * typedef for ::hwloc_bitmap_t).
- *
- * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ * The ::hwloc_bitmap_t type represents a set of integers (positive or null).
+ * A bitmap may be of infinite size (all bits are set after some point).
+ * A bitmap may even be full if all bits are set.
  *
- * \note CPU sets and nodesets are described in \ref hwlocality_object_sets.
+ * Bitmaps are used by hwloc for sets of OS processors
+ * (which may actually be hardware threads) as by ::hwloc_cpuset_t
+ * (a typedef for ::hwloc_bitmap_t), or sets of NUMA memory nodes
+ * as ::hwloc_nodeset_t (also a typedef for ::hwloc_bitmap_t).
+ * Those are used for cpuset and nodeset fields in the ::hwloc_obj structure,
+ * see \ref hwlocality_object_sets.
  *
- * A bitmap may be of infinite size.
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ * However users should usually not build CPU and node sets manually
+ * (e.g. with hwloc_bitmap_set()).
+ * One should rather use existing object sets and combine them with
+ * hwloc_bitmap_or(), etc.
+ * For instance, binding the current thread on a pair of cores may be performed with:
+ * \code
+ * hwloc_obj_t core1 = ... , core2 = ... ;
+ * hwloc_bitmap_t set = hwloc_bitmap_alloc();
+ * hwloc_bitmap_or(set, core1->cpuset, core2->cpuset);
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD);
+ * hwloc_bitmap_free(set);
+ * \endcode
+ *
+ * \note Most functions below return an int that may be negative in case of
+ * error. The usual error case would be an internal failure to realloc/extend
+ * the storage of the bitmap (\p errno would be set to \c ENOMEM).
  *
  * \note Several examples of using the bitmap API are available under the
  * doc/examples/ directory in the source tree.
- * Regression tests such as tests/hwloc_bitmap*.c also make intensive use
+ * Regression tests such as tests/hwloc/hwloc_bitmap*.c also make intensive use
  * of this API.
  * @{
  */
@@ -81,7 +99,7 @@ HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
 HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
 
 /** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
-HWLOC_DECLSPEC void hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
 
 
 /*
@@ -100,6 +118,8 @@ HWLOC_DECLSPEC void hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t s
 HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
 
 /** \brief Stringify a bitmap into a newly allocated string.
+ *
+ * \return -1 on error.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
 
@@ -111,7 +131,7 @@ HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwl
  *
  * Lists are comma-separated indexes or ranges.
  * Ranges are dash separated indexes.
- * The last range may not have a ending indexes if the bitmap is infinite.
+ * The last range may not have an ending index if the bitmap is infinitely set.
  *
  * Up to \p buflen characters may be written in buffer \p buf.
  *
@@ -123,6 +143,8 @@ HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwl
 HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
 
 /** \brief Stringify a bitmap into a newly allocated list string.
+ *
+ * \return -1 on error.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
 
@@ -145,6 +167,8 @@ HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char *
 HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
 
 /** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ *
+ * \return -1 on error.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
 
@@ -164,16 +188,19 @@ HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
 HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
 
 /** \brief Empty the bitmap \p bitmap and add bit \p id */
-HWLOC_DECLSPEC void hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+HWLOC_DECLSPEC int hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
 
 /** \brief Fill the bitmap \p and clear the index \p id */
-HWLOC_DECLSPEC void hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+HWLOC_DECLSPEC int hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
 
 /** \brief Setup bitmap \p bitmap from unsigned long \p mask */
-HWLOC_DECLSPEC void hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
 
 /** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
-HWLOC_DECLSPEC void hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+HWLOC_DECLSPEC int hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned longs \p masks used as first \p nr subsets */
+HWLOC_DECLSPEC int hwloc_bitmap_from_ulongs(hwloc_bitmap_t bitmap, unsigned nr, const unsigned long *masks);
 
 
 /*
@@ -181,33 +208,46 @@ HWLOC_DECLSPEC void hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned
  */
 
 /** \brief Add index \p id in bitmap \p bitmap */
-HWLOC_DECLSPEC void hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+HWLOC_DECLSPEC int hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
 
 /** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
  *
  * If \p end is \c -1, the range is infinite.
  */
-HWLOC_DECLSPEC void hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+HWLOC_DECLSPEC int hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
 
 /** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
-HWLOC_DECLSPEC void hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+HWLOC_DECLSPEC int hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
 
 /** \brief Remove index \p id from bitmap \p bitmap */
-HWLOC_DECLSPEC void hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+HWLOC_DECLSPEC int hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
 
 /** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
  *
  * If \p end is \c -1, the range is infinite.
  */
-HWLOC_DECLSPEC void hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+HWLOC_DECLSPEC int hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
 
 /** \brief Keep a single index among those set in bitmap \p bitmap
  *
  * May be useful before binding so that the process does not
  * have a chance of migrating between multiple logical CPUs
  * in the original mask.
+ * Instead of running the task on any PU inside the given CPU set,
+ * the operating system scheduler will be forced to run it on a single
+ * of these PUs.
+ * It avoids a migration overhead and cache-line ping-pongs between PUs.
+ *
+ * \note This function is NOT meant to distribute multiple processes
+ * within a single CPU set. It always returns the same single bit when
+ * called multiple times on the same input set. hwloc_distrib() may
+ * be used for generating CPU sets to distribute multiple tasks below
+ * a single multi-PU object.
+ *
+ * \note This function cannot be applied to an object set directly. It
+ * should be applied to a copy (which may be obtained with hwloc_bitmap_dup()).
  */
-HWLOC_DECLSPEC void hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+HWLOC_DECLSPEC int hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
 
 
 /*
@@ -220,18 +260,52 @@ HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap)
 /** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
 HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
 
-/** \brief Test whether index \p id is part of bitmap \p bitmap */
+/** \brief Convert the first \p nr subsets of bitmap \p bitmap into the array of \p nr unsigned long \p masks
+ *
+ * \p nr may be determined earlier with hwloc_bitmap_nr_ulongs().
+ *
+ * \return 0
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_to_ulongs(hwloc_const_bitmap_t bitmap, unsigned nr, unsigned long *masks);
+
+/** \brief Return the number of unsigned longs required for storing bitmap \p bitmap entirely
+ *
+ * This is the number of contiguous unsigned longs from the very first bit of the bitmap
+ * (even if unset) up to the last set bit.
+ * This is useful for knowing the \p nr parameter to pass to hwloc_bitmap_to_ulongs()
+ * (or which calls to hwloc_bitmap_to_ith_ulong() are needed)
+ * to entirely convert a bitmap into multiple unsigned longs.
+ *
+ * When called on the output of hwloc_topology_get_topology_cpuset(),
+ * the returned number is large enough for all cpusets of the topology.
+ *
+ * \return -1 if \p bitmap is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_nr_ulongs(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap.
+ *
+ * \return 1 if the bit at index \p id is set in bitmap \p bitmap, 0 otherwise.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
 
-/** \brief Test whether bitmap \p bitmap is empty */
+/** \brief Test whether bitmap \p bitmap is empty
+ *
+ * \return 1 if bitmap is empty, 0 otherwise.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
 
-/** \brief Test whether bitmap \p bitmap is completely full */
+/** \brief Test whether bitmap \p bitmap is completely full
+ *
+ * \return 1 if bitmap is full, 0 otherwise.
+ *
+ * \note A full bitmap is always infinitely set.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
 
 /** \brief Compute the first index (least significant bit) in bitmap \p bitmap
  *
- * \return -1 if no index is set.
+ * \return -1 if no index is set in \p bitmap.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
 
@@ -239,13 +313,13 @@ HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attri
  *
  * If \p prev is -1, the first index is returned.
  *
- * \return -1 if no index with higher index is bitmap.
+ * \return -1 if no index with higher index is set in \p bitmap.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
 
 /** \brief Compute the last index (most significant bit) in bitmap \p bitmap
  *
- * \return -1 if no index is bitmap, or if the index bitmap is infinite.
+ * \return -1 if no index is set in \p bitmap, or if \p bitmap is infinitely set.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
 
@@ -253,31 +327,61 @@ HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attrib
  * indexes that are in the bitmap).
  *
  * \return the number of indexes that are in the bitmap.
+ *
+ * \return -1 if \p bitmap is infinitely set.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
 
+/** \brief Compute the first unset index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next unset index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first unset index is returned.
+ *
+ * \return -1 if no index with higher index is unset in \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next_unset(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last unset index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is unset in \p bitmap, or if \p bitmap is infinitely set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
 /** \brief Loop macro iterating on bitmap \p bitmap
- * \hideinitializer
+ *
+ * The loop must start with hwloc_bitmap_foreach_begin() and end
+ * with hwloc_bitmap_foreach_end() followed by a terminating ';'.
  *
  * \p index is the loop variable; it should be an unsigned int.  The
  * first iteration will set \p index to the lowest index in the bitmap.
  * Successive iterations will iterate through, in order, all remaining
- * indexes that in the bitmap.  To be specific: each iteration will return a
+ * indexes set in the bitmap.  To be specific: each iteration will return a
  * value for \p index such that hwloc_bitmap_isset(bitmap, index) is true.
  *
- * The assert prevents the loop from being infinite if the bitmap is infinite.
+ * The assert prevents the loop from being infinite if the bitmap is infinitely set.
+ *
+ * \hideinitializer
  */
 #define hwloc_bitmap_foreach_begin(id, bitmap) \
 do { \
         assert(hwloc_bitmap_weight(bitmap) != -1); \
         for (id = hwloc_bitmap_first(bitmap); \
              (unsigned) id != (unsigned) -1; \
-             id = hwloc_bitmap_next(bitmap, id)) { \
-/** \brief End of loop. Needs a terminating ';'.
- * \hideinitializer
+             id = hwloc_bitmap_next(bitmap, id)) {
+
+/** \brief End of loop macro iterating on a bitmap.
+ *
+ * Needs a terminating ';'.
  *
- * \sa hwloc_bitmap_foreach_begin */
-#define hwloc_bitmap_foreach_end() \
+ * \sa hwloc_bitmap_foreach_begin()
+ * \hideinitializer
+ */
+#define hwloc_bitmap_foreach_end()		\
         } \
 } while (0)
 
@@ -290,50 +394,73 @@ do { \
  *
  * \p res can be the same as \p bitmap1 or \p bitmap2
  */
-HWLOC_DECLSPEC void hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+HWLOC_DECLSPEC int hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
 
 /** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
  *
  * \p res can be the same as \p bitmap1 or \p bitmap2
  */
-HWLOC_DECLSPEC void hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+HWLOC_DECLSPEC int hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
 
 /** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
  *
  * \p res can be the same as \p bitmap1 or \p bitmap2
  */
-HWLOC_DECLSPEC void hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+HWLOC_DECLSPEC int hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
 
 /** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
  *
  * \p res can be the same as \p bitmap1 or \p bitmap2
  */
-HWLOC_DECLSPEC void hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+HWLOC_DECLSPEC int hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
 
 /** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
  *
  * \p res can be the same as \p bitmap
  */
-HWLOC_DECLSPEC void hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+HWLOC_DECLSPEC int hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
 
 
 /*
  * Comparing bitmaps.
  */
 
-/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersects */
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect.
+ *
+ * \return 1 if bitmaps intersect, 0 otherwise.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
 
-/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap */
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap.
+ *
+ * \return 1 if \p sub_bitmap is included in \p super_bitmap, 0 otherwise.
+ *
+ * \note The empty bitmap is considered included in any other bitmap.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
 
-/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2 */
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2.
+ *
+ * \return 1 if bitmaps are equal, 0 otherwise.
+ */
 HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
 
 /** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
  *
- * Smaller least significant bit is smaller.
- * The empty bitmap is considered higher than anything.
+ * A bitmap is considered smaller if its least significant bit is smaller.
+ * The empty bitmap is considered higher than anything (because its least significant bit does not exist).
+ *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110)
+ * because the least significant bit of 0011 (0001) is smaller than the least significant bit of 0110 (0010).
+ * Comparing 01001 and 00110 would also return -1 for the same reason.
+ *
+ * \return 0 if bitmaps are considered equal, even if they are not strictly equal.
+ * They just need to have the same least significant bit.
+ * For instance, comparing binary bitmaps 0010 and 0110 returns 0 because they have the same least significant bit.
  */
 HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
 
@@ -343,6 +470,14 @@ HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwlo
  * Compare last indexes first, then second, etc.
  * The empty bitmap is considered lower than anything.
  *
+ * \return -1 if \p bitmap1 is considered smaller than \p bitmap2.
+ * \return 1 if \p bitmap1 is considered larger than \p bitmap2.
+ * \return 0 if bitmaps are equal (contrary to hwloc_bitmap_compare_first()).
+ *
+ * For instance comparing binary bitmaps 0011 and 0110 returns -1
+ * (hence 0011 is considered smaller than 0110).
+ * Comparing 00101 and 01010 returns -1 too.
+ *
  * \note This is different from the non-existing hwloc_bitmap_compare_last()
  * which would only compare the highest index of each bitmap.
  */
diff --git a/ext/hwloc/include/hwloc/cuda.h b/ext/hwloc/include/hwloc/cuda.h
index a02d67769..6f0cda4cd 100644
--- a/ext/hwloc/include/hwloc/cuda.h
+++ b/ext/hwloc/include/hwloc/cuda.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2017 Inria.  All rights reserved.
  * Copyright © 2010-2011 Université Bordeaux
  * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -16,11 +16,11 @@
 #ifndef HWLOC_CUDA_H
 #define HWLOC_CUDA_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc/helper.h"
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
 #endif
 
 #include <cuda.h>
@@ -96,7 +96,6 @@ hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
   /* If we're on Linux, use the sysfs mechanism to get the local cpus */
 #define HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX 128
   char path[HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX];
-  FILE *sysfile = NULL;
   int domainid, busid, deviceid;
 
   if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domainid, &busid, &deviceid))
@@ -108,15 +107,9 @@ hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
   }
 
   sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domainid, busid, deviceid);
-  sysfile = fopen(path, "r");
-  if (!sysfile)
-    return -1;
-
-  hwloc_linux_parse_cpumap_file(sysfile, set);
-  if (hwloc_bitmap_iszero(set))
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
     hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-  fclose(sysfile);
 #else
   /* Non-Linux systems simply get a full cpuset */
   hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
@@ -151,12 +144,14 @@ hwloc_cuda_get_device_pcidev(hwloc_topology_t topology, CUdevice cudevice)
  * CUDA device \p cudevice. Return NULL if there is none.
  *
  * Topology \p topology and device \p cudevice must match the local machine.
- * I/O devices detection and the NVML component must be enabled in the topology.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
  * If not, the locality of the object may still be found using
  * hwloc_cuda_get_device_cpuset().
  *
+ * \note This function cannot work if PCI devices are filtered out.
+ *
  * \note The corresponding hwloc PCI device may be found by looking
- * at the result parent pointer.
+ * at the result parent pointer (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
@@ -179,6 +174,7 @@ hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
 		    && (int) pcidev->attr->pcidev.dev == dev
 		    && pcidev->attr->pcidev.func == 0)
 			return osdev;
+		/* if PCI devices are filtered out, we need an info attr to match on */
 	}
 
 	return NULL;
@@ -195,7 +191,7 @@ hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
  * I/O devices detection and the CUDA component must be enabled in the topology.
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  *
  * \note This function is identical to hwloc_cudart_get_device_osdev_by_index().
  */
diff --git a/ext/hwloc/include/hwloc/cudart.h b/ext/hwloc/include/hwloc/cudart.h
index 759c3cf4f..688b8421e 100644
--- a/ext/hwloc/include/hwloc/cudart.h
+++ b/ext/hwloc/include/hwloc/cudart.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2017 Inria.  All rights reserved.
  * Copyright © 2010-2011 Université Bordeaux
  * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -16,11 +16,11 @@
 #ifndef HWLOC_CUDART_H
 #define HWLOC_CUDART_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc/helper.h"
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
 #endif
 
 #include <cuda.h> /* for CUDA_VERSION */
@@ -93,7 +93,6 @@ hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unuse
   /* If we're on Linux, use the sysfs mechanism to get the local cpus */
 #define HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX 128
   char path[HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX];
-  FILE *sysfile = NULL;
   int domain, bus, dev;
 
   if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
@@ -104,16 +103,10 @@ hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unuse
     return -1;
   }
 
-  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domain, bus, dev);
-  sysfile = fopen(path, "r");
-  if (!sysfile)
-    return -1;
-
-  hwloc_linux_parse_cpumap_file(sysfile, set);
-  if (hwloc_bitmap_iszero(set))
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", (unsigned) domain, (unsigned) bus, (unsigned) dev);
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
     hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-  fclose(sysfile);
 #else
   /* Non-Linux systems simply get a full cpuset */
   hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
@@ -155,7 +148,7 @@ hwloc_cudart_get_device_pcidev(hwloc_topology_t topology, int idx)
  * hwloc_cudart_get_device_cpuset().
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  *
  * \note This function is identical to hwloc_cuda_get_device_osdev_by_index().
  */
diff --git a/ext/hwloc/include/hwloc/deprecated.h b/ext/hwloc/include/hwloc/deprecated.h
index c4370b60a..4a231f507 100644
--- a/ext/hwloc/include/hwloc/deprecated.h
+++ b/ext/hwloc/include/hwloc/deprecated.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -21,94 +21,188 @@
 extern "C" {
 #endif
 
+/* backward compat with v2.0 before WHOLE_SYSTEM renaming */
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED
+/* backward compat with v1.11 before System removal */
+#define HWLOC_OBJ_SYSTEM HWLOC_OBJ_MACHINE
 /* backward compat with v1.10 before Socket->Package renaming */
 #define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
 /* backward compat with v1.10 before Node->NUMANode clarification */
 #define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
 
-/** \brief Return an object type from the string
+/** \brief Insert a misc object by parent.
  *
- * \return -1 if unrecognized.
+ * Identical to hwloc_topology_insert_misc_object().
  */
-HWLOC_DECLSPEC hwloc_obj_type_t hwloc_obj_type_of_string (const char * string) __hwloc_attribute_pure __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+  return hwloc_topology_insert_misc_object(topology, parent, name);
+}
 
-/** \brief Stringify a given topology object into a human-readable form.
- *
- * \note This function is deprecated in favor of hwloc_obj_type_snprintf()
- * and hwloc_obj_attr_snprintf() since it is not very flexible and
- * only prints physical/OS indexes.
- *
- * Fill string \p string up to \p size characters with the description
- * of topology object \p obj in topology \p topology.
- *
- * If \p verbose is set, a longer description is used. Otherwise a
- * short description is used.
- *
- * \p indexprefix is used to prefix the \p os_index attribute number of
- * the object in the description. If \c NULL, the \c # character is used.
+/** \brief Stringify the cpuset containing a set of objects.
  *
  * If \p size is 0, \p string may safely be \c NULL.
  *
  * \return the number of character that were actually written if not truncating,
  * or that would have been written (not including the ending \\0).
  */
-HWLOC_DECLSPEC int hwloc_obj_snprintf(char * __hwloc_restrict string, size_t size,
-				      hwloc_topology_t topology, hwloc_obj_t obj,
-				      const char * __hwloc_restrict indexprefix, int verbose) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  int res;
+  unsigned i;
 
-/** \brief Distribute \p n items over the topology under \p root
- *
- * Array \p cpuset will be filled with \p n cpusets recursively distributed
- * linearly over the topology under \p root, down to depth \p until (which can
- * be INT_MAX to distribute down to the finest level).
- *
- * This is typically useful when an application wants to distribute \p n
- * threads over a machine, giving each of them as much private cache as
- * possible and keeping them locally in number order.
- *
- * The caller may typically want to also call hwloc_bitmap_singlify()
- * before binding a thread so that it does not move at all.
+  hwloc_bitmap_zero(set);
+  for(i=0; i<nobj; i++)
+    if (objs[i]->cpuset)
+      hwloc_bitmap_or(set, set, objs[i]->cpuset);
+
+  res = hwloc_bitmap_snprintf(str, size, set);
+  hwloc_bitmap_free(set);
+  return res;
+}
+
+/** \brief Convert a type string into a type and some attributes.
  *
- * \note This function requires the \p root object to have a CPU set.
+ * Deprecated by hwloc_type_sscanf()
  */
-static __hwloc_inline void
-hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
-static __hwloc_inline void
-hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until)
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
 {
-  hwloc_distrib(topology, &root, 1, set, n, until, 0);
+  union hwloc_obj_attr_u attr;
+  int err = hwloc_type_sscanf(string, typep, &attr, sizeof(attr));
+  if (err < 0)
+    return err;
+  if (hwloc_obj_type_is_cache(*typep)) {
+    if (depthattrp)
+      *depthattrp = (int) attr.cache.depth;
+    if (typeattrp && typeattrsize >= sizeof(hwloc_obj_cache_type_t))
+      memcpy(typeattrp, &attr.cache.type, sizeof(hwloc_obj_cache_type_t));
+  } else if (*typep == HWLOC_OBJ_GROUP) {
+    if (depthattrp)
+      *depthattrp = (int) attr.group.depth;
+  }
+  return 0;
 }
 
-/** \brief Distribute \p n items over the topology under \p roots
- *
- * This is the same as hwloc_distribute, but takes an array of roots instead of
- * just one root.
- *
- * \note This function requires the \p roots objects to have a CPU set.
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ */
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ */
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_membind(topology, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset
+ */
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ */
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_proc_membind(topology, pid, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) in physical \p nodeset.
+ */
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_set_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len ).
+ */
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags) __hwloc_attribute_deprecated;
+static __hwloc_inline int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  return hwloc_get_area_membind(topology, addr, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Allocate some memory on the given nodeset \p nodeset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc __hwloc_attribute_deprecated;
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  return hwloc_alloc_membind_policy(topology, len, nodeset, policy, flags | HWLOC_MEMBIND_BYNODESET);
+}
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
  */
 static __hwloc_inline void
-hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) __hwloc_attribute_deprecated;
 static __hwloc_inline void
-hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until)
+hwloc_cpuset_to_nodeset_strict(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
 {
-  hwloc_distrib(topology, roots, n_roots, set, n, until, 0);
+  hwloc_cpuset_to_nodeset(topology, _cpuset, nodeset);
 }
 
-/** \brief Insert a misc object by parent.
- *
- * Identical to hwloc_topology_insert_misc_object().
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
  */
-static __hwloc_inline hwloc_obj_t
-hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
-static __hwloc_inline hwloc_obj_t
-hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
 {
-  return hwloc_topology_insert_misc_object(topology, parent, name);
+  hwloc_cpuset_from_nodeset(topology, _cpuset, nodeset);
 }
 
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 
 
-#endif /* HWLOC_INLINES_H */
+#endif /* HWLOC_DEPRECATED_H */
diff --git a/ext/hwloc/include/hwloc/diff.h b/ext/hwloc/include/hwloc/diff.h
index 3f1beb126..79f2df3de 100644
--- a/ext/hwloc/include/hwloc/diff.h
+++ b/ext/hwloc/include/hwloc/diff.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2013-2014 Inria.  All rights reserved.
+ * Copyright © 2013-2018 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
@@ -43,6 +43,8 @@ extern "C" {
  * More complex differences such as adding or removing objects cannot
  * be represented in the difference structures and therefore return
  * errors.
+ * Differences between object sets or topology-wide allowed sets
+ * cannot be represented either.
  *
  * It means that there is no need to apply the difference when
  * looking at the tree organization (how many levels, how many
@@ -59,19 +61,19 @@ extern "C" {
  */
 typedef enum hwloc_topology_diff_obj_attr_type_e {
   /** \brief The object local memory is modified.
-   * The union is a hwloc_topology_diff_obj_attr_uint64_s
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_uint64_s
    * (and the index field is ignored).
    */
   HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
 
   /** \brief The object name is modified.
-   * The union is a hwloc_topology_diff_obj_attr_string_s
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s
    * (and the name field is ignored).
    */
 
   HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
   /** \brief the value of an info attribute is modified.
-   * The union is a hwloc_topology_diff_obj_attr_string_s.
+   * The union is a hwloc_topology_diff_obj_attr_u::hwloc_topology_diff_obj_attr_string_s.
    */
   HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
 } hwloc_topology_diff_obj_attr_type_t;
@@ -107,17 +109,17 @@ union hwloc_topology_diff_obj_attr_u {
 /** \brief Type of one element of a difference list.
  */
 typedef enum hwloc_topology_diff_type_e {
-  /*< \brief An object attribute was changed.
-  * The union is a hwloc_topology_diff_obj_attr_s.
-  */
+  /** \brief An object attribute was changed.
+   * The union is a hwloc_topology_diff_u::hwloc_topology_diff_obj_attr_s.
+   */
   HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
 
-  /*< \brief The difference is too complex,
+  /** \brief The difference is too complex,
    * it cannot be represented. The difference below
    * this object has not been checked.
    * hwloc_topology_diff_build() will return 1.
    *
-   * The union is a hwloc_topology_diff_too_complex_s.
+   * The union is a hwloc_topology_diff_u::hwloc_topology_diff_too_complex_s.
    */
   HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
 } hwloc_topology_diff_type_t;
@@ -133,20 +135,20 @@ typedef union hwloc_topology_diff_u {
 
   /* A difference in an object attribute. */
   struct hwloc_topology_diff_obj_attr_s {
-    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
     union hwloc_topology_diff_u * next;
     /* List of attribute differences for a single object */
-    unsigned obj_depth;
+    int obj_depth;
     unsigned obj_index;
     union hwloc_topology_diff_obj_attr_u diff;
   } obj_attr;
 
   /* A difference that is too complex. */
   struct hwloc_topology_diff_too_complex_s {
-    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+    hwloc_topology_diff_type_t type; /* must be ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
     union hwloc_topology_diff_u * next;
     /* Where we had to stop computing the diff in the first topology */
-    unsigned obj_depth;
+    int obj_depth;
     unsigned obj_index;
   } too_complex;
 } * hwloc_topology_diff_t;
@@ -154,14 +156,14 @@ typedef union hwloc_topology_diff_u {
 
 /** \brief Compute the difference between 2 topologies.
  *
- * The difference is stored as a list of hwloc_topology_diff_t entries
+ * The difference is stored as a list of ::hwloc_topology_diff_t entries
  * starting at \p diff.
  * It is computed by doing a depth-first traversal of both topology trees
  * simultaneously.
  *
  * If the difference between 2 objects is too complex to be represented
  * (for instance if some objects have different types, or different numbers
- * of children), a special diff entry of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * of children), a special diff entry of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
  * is queued.
  * The computation of the diff does not continue below these objects.
  * So each such diff entry means that the difference between two subtrees
@@ -173,7 +175,7 @@ typedef union hwloc_topology_diff_u {
  * between the topologies.
  *
  * \return 1 if the difference is too complex (see above). Some entries in
- * the list will be of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ * the list will be of type ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
  *
  * \return -1 on any other error.
  *
@@ -183,7 +185,7 @@ typedef union hwloc_topology_diff_u {
  *
  * \note The output diff can only be exported to XML or passed to
  * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
- * HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ * ::HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
  *
  * \note The output diff may be modified by removing some entries from
  * the list. The removed entries should be freed by passing them to
@@ -202,7 +204,7 @@ enum hwloc_topology_diff_apply_flags_e {
 
 /** \brief Apply a topology diff to an existing topology.
  *
- * \p flags is an OR'ed set of hwloc_topology_diff_apply_flags_e.
+ * \p flags is an OR'ed set of ::hwloc_topology_diff_apply_flags_e.
  *
  * The new topology is modified in place. hwloc_topology_dup()
  * may be used to duplicate it before patching.
@@ -220,11 +222,8 @@ enum hwloc_topology_diff_apply_flags_e {
 HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
 
 /** \brief Destroy a list of topology differences.
- *
- * \note The \p topology parameter must be a valid topology
- * but it is not required that it is related to \p diff.
  */
-HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_t topology, hwloc_topology_diff_t diff);
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff);
 
 /** \brief Load a list of topology differences from a XML file.
  *
@@ -234,13 +233,10 @@ HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_t topology, hwloc_
  * This identifier is usually the name of the other XML file
  * that contains the reference topology.
  *
- * \note The \p topology parameter must be a valid topology
- * but it is not required that it is related to \p diff.
- *
  * \note the pointer returned in refname should later be freed
  * by the caller.
  */
-HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(hwloc_topology_t topology, const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
 
 /** \brief Export a list of topology differences to a XML file.
  *
@@ -250,11 +246,8 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(hwloc_topology_t topology, const
  * This identifier is usually the name of the other XML file
  * that contains the reference topology.
  * This attribute is given back when reading the diff from XML.
- *
- * \note The \p topology parameter must be a valid topology
- * but it is not required that it is related to \p diff.
  */
-HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
 
 /** \brief Load a list of topology differences from a XML buffer.
  *
@@ -264,13 +257,10 @@ HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_t topology, hwl
  * This identifier is usually the name of the other XML file
  * that contains the reference topology.
  *
- * \note The \p topology parameter must be a valid topology
- * but it is not required that it is related to \p diff.
- *
  * \note the pointer returned in refname should later be freed
  * by the caller.
   */
-HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(hwloc_topology_t topology, const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
 
 /** \brief Export a list of topology differences to a XML buffer.
  *
@@ -281,12 +271,12 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(hwloc_topology_t topology,
  * that contains the reference topology.
  * This attribute is given back when reading the diff from XML.
  *
- * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
  *
- * \note The \p topology parameter must be a valid topology
- * but it is not required that it is related to \p diff.
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
  */
-HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
 
 /** @} */
 
@@ -296,4 +286,4 @@ HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_t topolog
 #endif
 
 
-#endif /* HWLOC_HELPER_H */
+#endif /* HWLOC_DIFF_H */
diff --git a/ext/hwloc/include/hwloc/distances.h b/ext/hwloc/include/hwloc/distances.h
new file mode 100644
index 000000000..b7baed8a4
--- /dev/null
+++ b/ext/hwloc/include/hwloc/distances.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright © 2010-2019 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Object distances.
+ */
+
+#ifndef HWLOC_DISTANCES_H
+#define HWLOC_DISTANCES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_distances_get Retrieve distances between objects
+ * @{
+ */
+
+/** \brief Matrix of distances between a set of objects.
+ *
+ * This matrix often contains latencies between NUMA nodes
+ * (as reported in the System Locality Distance Information Table (SLIT)
+ * in the ACPI specification), which may or may not be physically accurate.
+ * It corresponds to the latency for accessing the memory of one node
+ * from a core in another node.
+ * The corresponding kind is ::HWLOC_DISTANCES_KIND_FROM_OS | ::HWLOC_DISTANCES_KIND_MEANS_LATENCY.
+ *
+ * The matrix may also contain bandwidths between random sets of objects,
+ * possibly provided by the user, as specified in the \p kind attribute.
+ */
+struct hwloc_distances_s {
+  unsigned nbobjs;		/**< \brief Number of objects described by the distance matrix. */
+  hwloc_obj_t *objs;		/**< \brief Array of \p nbobjs objects described by the distance matrix.
+				 * These objects are not in any particular order,
+				 * see hwloc_distances_obj_index() and hwloc_distances_obj_pair_values()
+				 * for easy ways to find objects in this array and their corresponding values.
+				 */
+  unsigned long kind;		/**< \brief OR'ed set of ::hwloc_distances_kind_e. */
+  hwloc_uint64_t *values;	/**< \brief Matrix of distances between objects, stored as a one-dimension array.
+				 *
+				 * Distance from i-th to j-th object (i and j being indexes in \p objs) is stored in slot i*nbobjs+j.
+				 * The meaning of the value depends on the \p kind attribute.
+				 */
+};
+
+/** \brief Kinds of distance matrices.
+ *
+ * The \p kind attribute of struct hwloc_distances_s is an OR'ed set
+ * of kinds.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_FROM_* specifies where the
+ * distance information comes from, if known.
+ *
+ * A kind of format HWLOC_DISTANCES_KIND_MEANS_* specifies whether
+ * values are latencies or bandwidths, if applicable.
+ */
+enum hwloc_distances_kind_e {
+  /** \brief These distances were obtained from the operating system or hardware.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_OS = (1UL<<0),
+  /** \brief These distances were provided by the user.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_FROM_USER = (1UL<<1),
+
+  /** \brief Distance values are similar to latencies between objects.
+   * Values are smaller for closer objects, hence minimal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * It could also be the number of network hops between objects, etc.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_LATENCY = (1UL<<2),
+  /** \brief Distance values are similar to bandwidths between objects.
+   * Values are higher for closer objects, hence maximal on the diagonal
+   * of the matrix (distance between an object and itself).
+   * Such values are currently ignored for distance-based grouping.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH = (1UL<<3),
+
+  /** \brief This distances structure covers objects of different types.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES = (1UL<<4)
+};
+
+/** \brief Retrieve distance matrices.
+ *
+ * Retrieve distance matrices from the topology into the \p distances array.
+ *
+ * \p flags is currently unused, should be \c 0.
+ *
+ * \p kind serves as a filter. If \c 0, all distance matrices are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_FROM_*, only distance matrices
+ * whose kind matches one of these are returned.
+ * If it contains some HWLOC_DISTANCES_KIND_MEANS_*, only distance matrices
+ * whose kind matches one of these are returned.
+ *
+ * On input, \p nr points to the number of distance matrices that may be stored
+ * in \p distances.
+ * On output, \p nr points to the number of distance matrices that were actually
+ * found, even if some of them couldn't be stored in \p distances.
+ * Distance matrices that couldn't be stored are ignored, but the function still
+ * returns success (\c 0). The caller may find out by comparing the value pointed
+ * by \p nr before and after the function call.
+ *
+ * Each distance matrix returned in the \p distances array should be released
+ * by the caller using hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get(hwloc_topology_t topology,
+		    unsigned *nr, struct hwloc_distances_s **distances,
+		    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_get() with the additional \p depth filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth,
+			     unsigned *nr, struct hwloc_distances_s **distances,
+			     unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve distance matrices for object of a specific type.
+ *
+ * Identical to hwloc_distances_get() with the additional \p type filter.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type,
+			    unsigned *nr, struct hwloc_distances_s **distances,
+			    unsigned long kind, unsigned long flags);
+
+/** \brief Retrieve a distance matrix with the given name.
+ *
+ * Usually only one distances structure may match a given name.
+ */
+HWLOC_DECLSPEC int
+hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name,
+			    unsigned *nr, struct hwloc_distances_s **distances,
+			    unsigned long flags);
+
+/** \brief Get a description of what a distances structure contains.
+ *
+ * For instance "NUMALatency" for hardware-provided NUMA distances (ACPI SLIT),
+ * or NULL if unknown.
+ */
+HWLOC_DECLSPEC const char *
+hwloc_distances_get_name(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** \brief Release a distance matrix structure previously returned by hwloc_distances_get().
+ *
+ * \note This function is not required if the structure is removed with hwloc_distances_release_remove().
+ */
+HWLOC_DECLSPEC void
+hwloc_distances_release(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_consult Helpers for consulting distance matrices
+ * @{
+ */
+
+/** \brief Look up the position of an object within a distances structure.
+ *
+ * \return The index of \p obj in the objs array of \p distances, or -1 if \p obj is not listed there.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_index(struct hwloc_distances_s *distances, hwloc_obj_t obj)
+{
+  unsigned pos = 0; /* candidate slot in distances->objs */
+  while (pos < distances->nbobjs && distances->objs[pos] != obj)
+    pos++;
+  /* pos == nbobjs means the scan reached the end without a match */
+  return pos < distances->nbobjs ? (int) pos : -1;
+}
+
+/** \brief Fetch both directed values stored for a pair of objects in a distance matrix.
+ *
+ * On success, the value from \p obj1 to \p obj2 is written to \p value1to2
+ * and the value from \p obj2 to \p obj1 is written to \p value2to1.
+ *
+ * \return 0 on success, -1 if \p obj1 or \p obj2 is not involved in structure \p distances.
+ */
+static __hwloc_inline int
+hwloc_distances_obj_pair_values(struct hwloc_distances_s *distances,
+				hwloc_obj_t obj1, hwloc_obj_t obj2,
+				hwloc_uint64_t *value1to2, hwloc_uint64_t *value2to1)
+{
+  const int row1 = hwloc_distances_obj_index(distances, obj1);
+  const int row2 = hwloc_distances_obj_index(distances, obj2);
+  const unsigned width = distances->nbobjs; /* row length of the flattened matrix */
+  if (row1 < 0 || row2 < 0) return -1; /* missing object: outputs are left untouched */
+  *value1to2 = distances->values[row1 * width + row2];
+  *value2to1 = distances->values[row2 * width + row1];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances_add Add or remove distances between objects
+ * @{
+ */
+
+/** \brief Flags for adding a new distances structure to a topology. */
+enum hwloc_distances_add_flag_e {
+  /** \brief Try to group objects based on the newly provided distance information.
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP = (1UL<<0),
+  /** \brief If grouping, consider the distance values as inaccurate and relax the
+   * comparisons during the grouping algorithms. The actual accuracy may be modified
+   * through the HWLOC_GROUPING_ACCURACY environment variable (see \ref envvar).
+   * \hideinitializer
+   */
+  HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE = (1UL<<1)
+};
+
+/** \brief Provide a new distance matrix.
+ *
+ * Provide the matrix of distances between a set of objects given by \p nbobjs
+ * and the \p objs array. \p nbobjs must be at least 2.
+ * The distances are stored as a one-dimension array in \p values.
+ * The distance from object i to object j is in slot i*nbobjs+j.
+ *
+ * \p kind specifies the kind of distance as a OR'ed set of ::hwloc_distances_kind_e.
+ * Kind ::HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES will be automatically added
+ * if objects of different types are given.
+ *
+ * \p flags configures the behavior of the function using an optional OR'ed set of
+ * ::hwloc_distances_add_flag_e.
+ */
+HWLOC_DECLSPEC int hwloc_distances_add(hwloc_topology_t topology,
+				       unsigned nbobjs, hwloc_obj_t *objs, hwloc_uint64_t *values,
+				       unsigned long kind, unsigned long flags);
+
+/** \brief Remove all distance matrices from a topology.
+ *
+ * Remove all distance matrices, either provided by the user or
+ * gathered through the OS.
+ *
+ * If these distances were used to group objects, these additional
+ * Group objects are not removed from the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove(hwloc_topology_t topology);
+
+/** \brief Remove distance matrices for objects at a specific depth in the topology.
+ *
+ * Identical to hwloc_distances_remove() but only applies to one level of the topology.
+ */
+HWLOC_DECLSPEC int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth);
+
+/** \brief Remove the distance matrices attached to objects of one given type.
+ *
+ * Resolves \p type to a depth and forwards to hwloc_distances_remove_by_depth(); returns 0 without removing anything if that depth is unknown or multiple.
+ */
+static __hwloc_inline int
+hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  const int level = hwloc_get_type_depth(topology, type); /* may be the special UNKNOWN or MULTIPLE value */
+  if (level != HWLOC_TYPE_DEPTH_UNKNOWN && level != HWLOC_TYPE_DEPTH_MULTIPLE)
+    return hwloc_distances_remove_by_depth(topology, level);
+  return 0; /* nothing was removed, still reported as success */
+}
+
+/** \brief Release and remove the given distance matrix from the topology.
+ *
+ * This function includes a call to hwloc_distances_release().
+ */
+HWLOC_DECLSPEC int hwloc_distances_release_remove(hwloc_topology_t topology, struct hwloc_distances_s *distances);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DISTANCES_H */
diff --git a/ext/hwloc/include/hwloc/export.h b/ext/hwloc/include/hwloc/export.h
index 194ee6cae..b178b77e5 100644
--- a/ext/hwloc/include/hwloc/export.h
+++ b/ext/hwloc/include/hwloc/export.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -28,10 +28,33 @@ extern "C" {
  * @{
  */
 
+/** \brief Flags for exporting XML topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_xml() or hwloc_topology_export_xmlbuffer().
+ */
+enum hwloc_topology_export_xml_flags_e {
+ /** \brief Export XML that is loadable by hwloc v1.x.
+  * However, the export may miss some details about the topology.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 = (1UL<<0)
+};
+
 /** \brief Export the topology into an XML file.
  *
  * This file may be loaded later through hwloc_topology_set_xml().
  *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported file may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * \p flags is a OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
  * \return -1 if a failure occured.
  *
  * \note See also hwloc_topology_set_userdata_export_callback()
@@ -45,7 +68,7 @@ extern "C" {
  *
  * \note If \p name is "-", the XML output is sent to the standard output.
  */
-HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath);
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath, unsigned long flags);
 
 /** \brief Export the topology into a newly-allocated XML memory buffer.
  *
@@ -54,6 +77,20 @@ HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const ch
  *
  * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
  *
+ * By default, the latest export format is used, which means older hwloc
+ * releases (e.g. v1.x) will not be able to import it.
+ * Exporting to v1.x specific XML format is possible using flag
+ * ::HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 but it may miss some details
+ * about the topology.
+ * If there is any chance that the exported buffer may ever be imported
+ * back by a process using hwloc 1.x, one should consider detecting
+ * it at runtime and using the corresponding export format.
+ *
+ * The returned buffer ends with a \0 that is included in the returned
+ * length.
+ *
+ * \p flags is a OR'ed set of ::hwloc_topology_export_xml_flags_e.
+ *
  * \return -1 if a failure occured.
  *
  * \note See also hwloc_topology_set_userdata_export_callback()
@@ -65,7 +102,7 @@ HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const ch
  * Any other character, especially any non-ASCII character, will be silently
  * dropped.
  */
-HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen);
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen, unsigned long flags);
 
 /** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
 HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
@@ -183,7 +220,27 @@ enum hwloc_topology_export_synthetic_flags_e {
   * This is required if loading the synthetic description with hwloc < 1.10.
   * \hideinitializer
   */
- HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1)
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1),
+
+ /** \brief Export the memory hierarchy as expected in hwloc 1.x.
+  *
+  * Instead of attaching memory children to levels, export single NUMA node child
+  * as normal intermediate levels, when possible.
+  * This is required if loading the synthetic description with hwloc 1.x.
+  * However this may fail if some objects have multiple local NUMA nodes.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 = (1UL<<2),
+
+ /** \brief Do not export memory information.
+  *
+  * Only export the actual hierarchy of normal CPU-side objects and ignore
+  * where memory is attached.
+  * This is useful for when the hierarchy of CPUs is what really matters,
+  * but it behaves as if there was a single machine-wide NUMA node.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY = (1UL<<3)
 };
 
 /** \brief Export the topology as a synthetic string.
@@ -193,7 +250,7 @@ enum hwloc_topology_export_synthetic_flags_e {
  *
  * This exported string may be given back to hwloc_topology_set_synthetic().
  *
- * \p flags is a OR'ed set of hwloc_topology_export_synthetic_flags_e.
+ * \p flags is a OR'ed set of ::hwloc_topology_export_synthetic_flags_e.
  *
  * \return The number of characters that were written,
  * not including the terminating \0.
diff --git a/ext/hwloc/include/hwloc/gl.h b/ext/hwloc/include/hwloc/gl.h
index 4b8b3f230..897ef784b 100644
--- a/ext/hwloc/include/hwloc/gl.h
+++ b/ext/hwloc/include/hwloc/gl.h
@@ -14,7 +14,7 @@
 #ifndef HWLOC_GL_H
 #define HWLOC_GL_H
 
-#include <hwloc.h>
+#include "hwloc.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -48,7 +48,7 @@ extern "C" {
  * I/O devices detection and the GL component must be enabled in the topology.
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
@@ -79,7 +79,7 @@ hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
  * I/O devices detection and the GL component must be enabled in the topology.
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_gl_get_display_osdev_by_name(hwloc_topology_t topology,
diff --git a/ext/hwloc/include/hwloc/glibc-sched.h b/ext/hwloc/include/hwloc/glibc-sched.h
index 1f9ba7cdd..99659e03c 100644
--- a/ext/hwloc/include/hwloc/glibc-sched.h
+++ b/ext/hwloc/include/hwloc/glibc-sched.h
@@ -17,8 +17,9 @@
 #ifndef HWLOC_GLIBC_SCHED_H
 #define HWLOC_GLIBC_SCHED_H
 
-#include <hwloc.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/helper.h"
+
 #include <assert.h>
 
 #if !defined _GNU_SOURCE || !defined _SCHED_H || (!defined CPU_SETSIZE && !defined sched_priority)
diff --git a/ext/hwloc/include/hwloc/helper.h b/ext/hwloc/include/hwloc/helper.h
index 883b87d52..bc27be591 100644
--- a/ext/hwloc/include/hwloc/helper.h
+++ b/ext/hwloc/include/hwloc/helper.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -75,17 +75,20 @@ HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topolo
  * included in \p set.  The next invokation should pass the previous
  * return value in \p prev so as to obtain the next object in \p set.
  *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
  * \note This function cannot work if objects at the given depth do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
-					   unsigned depth, hwloc_obj_t prev)
+					   int depth, hwloc_obj_t prev)
 {
   hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
   if (!next)
     return NULL;
-  while (next && !hwloc_bitmap_isincluded(next->cpuset, set))
+  while (next && (hwloc_bitmap_iszero(next->cpuset) || !hwloc_bitmap_isincluded(next->cpuset, set)))
     next = next->next_cousin;
   return next;
 }
@@ -96,8 +99,11 @@ hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_cons
  * and let the caller fallback to
  * hwloc_get_next_obj_inside_cpuset_by_depth().
  *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
  * \note This function cannot work if objects of the given type do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
@@ -110,23 +116,26 @@ hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const
 }
 
 /** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
  *
  * \note This function cannot work if objects at the given depth do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
-				      unsigned depth, unsigned idx) __hwloc_attribute_pure;
+				      int depth, unsigned idx) __hwloc_attribute_pure;
 static __hwloc_inline hwloc_obj_t
 hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
-				      unsigned depth, unsigned idx)
+				      int depth, unsigned idx)
 {
   hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
   unsigned count = 0;
   if (!obj)
     return NULL;
   while (obj) {
-    if (hwloc_bitmap_isincluded(obj->cpuset, set)) {
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set)) {
       if (count == idx)
 	return obj;
       count++;
@@ -142,8 +151,11 @@ hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpu
  * and let the caller fallback to
  * hwloc_get_obj_inside_cpuset_by_depth().
  *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
  * \note This function cannot work if objects of the given type do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
@@ -159,23 +171,26 @@ hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpus
 }
 
 /** \brief Return the number of objects at depth \p depth included in CPU set \p set.
+ *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
  *
  * \note This function cannot work if objects at the given depth do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline unsigned
 hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
-					 unsigned depth) __hwloc_attribute_pure;
+					 int depth) __hwloc_attribute_pure;
 static __hwloc_inline unsigned
 hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
-					 unsigned depth)
+					 int depth)
 {
   hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
   unsigned count = 0;
   if (!obj)
     return 0;
   while (obj) {
-    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
       count++;
     obj = obj->next_cousin;
   }
@@ -188,6 +203,9 @@ hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_
  * returned.  If there are several levels with objects of that type
  * inside CPU set \p set, -1 is returned.
  *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
  * \note This function cannot work if objects of the given type do
  * not have CPU sets (I/O objects).
  */
@@ -203,7 +221,7 @@ hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_c
     return 0;
   if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
     return -1; /* FIXME: agregate nbobjs from different levels? */
-  return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+  return (int) hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
 }
 
 /** \brief Return the logical index among the objects included in CPU set \p set.
@@ -214,6 +232,9 @@ hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_c
  * Otherwise, this is similar to a logical index within the part of the topology
  * defined by CPU set \p set.
  *
+ * \note Objects with empty CPU sets are ignored
+ * (otherwise they would be considered included in any given set).
+ *
  * \note This function cannot work if obj does not have CPU sets (I/O objects).
  */
 static __hwloc_inline int
@@ -228,7 +249,7 @@ hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_u
     return -1;
   /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
   while ((obj = obj->prev_cousin) != NULL)
-    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+    if (!hwloc_bitmap_iszero(obj->cpuset) && hwloc_bitmap_isincluded(obj->cpuset, set))
       idx++;
   return idx;
 }
@@ -245,7 +266,7 @@ hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_u
  *
  * \return \c NULL if no child matches or if \p set is empty.
  *
- * \note This function cannot work if parent does not have a CPU set (I/O objects).
+ * \note This function cannot work if parent does not have a CPU set (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
@@ -294,11 +315,11 @@ hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t s
  * to obtain the next object covering at least another part of \p set.
  *
  * \note This function cannot work if objects at the given depth do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
-					    unsigned depth, hwloc_obj_t prev)
+					    int depth, hwloc_obj_t prev)
 {
   hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
   if (!next)
@@ -321,7 +342,7 @@ hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_con
  * for each depth.
  *
  * \note This function cannot work if objects of the given type do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
@@ -346,11 +367,17 @@ hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_cons
  * package has fewer caches than its peers.
  */
 
-/** \brief Returns the ancestor object of \p obj at depth \p depth. */
+/** \brief Returns the ancestor object of \p obj at depth \p depth.
+ *
+ * \note \p depth should not be the depth of PU or NUMA objects
+ * since they are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate level depth,
+ * such as the depth of Packages, Cores, or Caches.
+ */
 static __hwloc_inline hwloc_obj_t
-hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj) __hwloc_attribute_pure;
 static __hwloc_inline hwloc_obj_t
-hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj)
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, int depth, hwloc_obj_t obj)
 {
   hwloc_obj_t ancestor = obj;
   if (obj->depth < depth)
@@ -360,7 +387,13 @@ hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unu
   return ancestor;
 }
 
-/** \brief Returns the ancestor object of \p obj with type \p type. */
+/** \brief Returns the ancestor object of \p obj with type \p type.
+ *
+ * \note \p type should not be ::HWLOC_OBJ_PU or ::HWLOC_OBJ_NUMANODE
+ * since these objects are ancestors of no objects (except Misc or I/O).
+ * This function rather expects an intermediate object type,
+ * such as ::HWLOC_OBJ_PACKAGE, ::HWLOC_OBJ_CORE, etc.
+ */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
 static __hwloc_inline hwloc_obj_t
@@ -372,7 +405,7 @@ hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unus
   return ancestor;
 }
 
-/** \brief Returns the common parent object to objects lvl1 and lvl2 */
+/** \brief Returns the common parent object to objects \p obj1 and \p obj2 */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
 static __hwloc_inline hwloc_obj_t
@@ -399,7 +432,7 @@ hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unuse
 /** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
  *
  * \note This function cannot work if \p obj and \p subtree_root objects do
- * not have CPU sets (I/O objects).
+ * not have CPU sets (I/O or Misc objects).
  */
 static __hwloc_inline int
 hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
@@ -411,7 +444,8 @@ hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwl
 
 /** \brief Return the next child.
  *
- * Return the next child among the normal children list, then among the I/O
+ * Return the next child among the normal children list,
+ * then among the memory children list, then among the I/O
  * children list, then among the Misc children list.
  *
  * If \p prev is \c NULL, return the first child.
@@ -425,21 +459,27 @@ hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_
   int state = 0;
   if (prev) {
     if (prev->type == HWLOC_OBJ_MISC)
-      state = 2;
+      state = 3;
     else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_NUMANODE)
       state = 1;
     obj = prev->next_sibling;
   } else {
     obj = parent->first_child;
   }
   if (!obj && state == 0) {
-    obj = parent->io_first_child;
+    obj = parent->memory_first_child;
     state = 1;
   }
   if (!obj && state == 1) {
-    obj = parent->misc_first_child;
+    obj = parent->io_first_child;
     state = 2;
   }
+  if (!obj && state == 2) {
+    obj = parent->misc_first_child;
+    state = 3;
+  }
   return obj;
 }
 
@@ -447,28 +487,108 @@ hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_
 
 
 
+/** \defgroup hwlocality_helper_types Kinds of object Type
+ * @{
+ *
+ * Each object type is
+ * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
+ * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
+ * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
+ * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
+ * It cannot be of more than one of these kinds.
+ */
+
+/** \brief Check whether an object type is Normal.
+ *
+ * Normal objects are objects of the main CPU hierarchy
+ * (Machine, Package, Core, PU, CPU caches, etc.),
+ * but they are not NUMA nodes, I/O devices or Misc objects.
+ *
+ * They are attached to parent as Normal children,
+ * not as Memory, I/O or Misc children.
+ *
+ * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is I/O.
+ *
+ * I/O objects are objects attached to their parents
+ * in the I/O children list.
+ * This currently includes Bridges, PCI and OS devices.
+ *
+ * \return 1 if an object of type \p type is an I/O object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_io(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is Memory.
+ *
+ * Memory objects are objects attached to their parents
+ * in the Memory children list.
+ * This currently includes NUMA nodes and Memory-side caches.
+ *
+ * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Cache (Data, Unified or Instruction).
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Data or Unified Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Data or Unified Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Instruction Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Instruction Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type);
+
+/** @} */
+
+
+
 /** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
  * @{
  */
 
-/** \brief Find the depth of cache objects matching cache depth and type.
+/** \brief Find the depth of cache objects matching cache level and type.
  *
  * Return the depth of the topology level that contains cache objects
- * whose attributes match \p cachedepth and \p cachetype. This function
- * intends to disambiguate the case where hwloc_get_type_depth() returns
- * \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ * whose attributes match \p cachelevel and \p cachetype.
+ *
+ * This function is identical to calling hwloc_get_type_depth() with the
+ * corresponding type such as ::HWLOC_OBJ_L1ICACHE, except that it may
+ * also return a Unified cache when looking for an instruction cache.
  *
- * If no cache level matches, \p HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ * If no cache level matches, ::HWLOC_TYPE_DEPTH_UNKNOWN is returned.
  *
- * If \p cachetype is \p HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED, the depth of the
  * unique matching unified cache level is returned.
  *
- * If \p cachetype is \p HWLOC_OBJ_CACHE_DATA or \p HWLOC_OBJ_CACHE_INSTRUCTION,
+ * If \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION,
  * either a matching cache, or a unified cache is returned.
  *
  * If \p cachetype is \c -1, it is ignored and multiple levels may
  * match. The function returns either the depth of a uniquely matching
- * level or \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ * level or ::HWLOC_TYPE_DEPTH_MULTIPLE.
  */
 static __hwloc_inline int
 hwloc_get_cache_type_depth (hwloc_topology_t topology,
@@ -480,7 +600,7 @@ hwloc_get_cache_type_depth (hwloc_topology_t topology,
     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0);
     if (!obj)
       break;
-    if (obj->type != HWLOC_OBJ_CACHE || obj->attr->cache.depth != cachelevel)
+    if (!hwloc_obj_type_is_dcache(obj->type) || obj->attr->cache.depth != cachelevel)
       /* doesn't match, try next depth */
       continue;
     if (cachetype == (hwloc_obj_cache_type_t) -1) {
@@ -500,7 +620,7 @@ hwloc_get_cache_type_depth (hwloc_topology_t topology,
   return found;
 }
 
-/** \brief Get the first cache covering a cpuset \p set
+/** \brief Get the first data (or unified) cache covering a cpuset \p set
  *
  * \return \c NULL if no cache matches.
  */
@@ -511,14 +631,14 @@ hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t
 {
   hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set);
   while (current) {
-    if (current->type == HWLOC_OBJ_CACHE)
+    if (hwloc_obj_type_is_dcache(current->type))
       return current;
     current = current->parent;
   }
   return NULL;
 }
 
-/** \brief Get the first cache shared between an object and somebody else.
+/** \brief Get the first data (or unified) cache shared between an object and somebody else.
  *
  * \return \c NULL if no cache matches or if an invalid object is given.
  */
@@ -532,7 +652,7 @@ hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute
     return NULL;
   while (current) {
     if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset)
-        && current->type == HWLOC_OBJ_CACHE)
+        && hwloc_obj_type_is_dcache(current->type))
       return current;
     current = current->parent;
   }
@@ -577,7 +697,7 @@ hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
  *
  * This function is useful for converting a nodeset into the NUMA node
  * objects it contains.
- * When retrieving the current binding (e.g. with hwloc_get_membind_nodeset()),
+ * When retrieving the current binding (e.g. with hwloc_get_membind() with HWLOC_MEMBIND_BYNODESET),
  * one may iterate over the bits of the resulting nodeset with
  * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
  * with this function.
@@ -714,7 +834,7 @@ hwloc_distrib(hwloc_topology_t topology,
 	      hwloc_obj_t *roots, unsigned n_roots,
 	      hwloc_cpuset_t *set,
 	      unsigned n,
-	      unsigned until, unsigned long flags)
+	      int until, unsigned long flags)
 {
   unsigned i;
   unsigned tot_weight;
@@ -728,13 +848,16 @@ hwloc_distrib(hwloc_topology_t topology,
 
   tot_weight = 0;
   for (i = 0; i < n_roots; i++)
-    tot_weight += hwloc_bitmap_weight(roots[i]->cpuset);
+    tot_weight += (unsigned) hwloc_bitmap_weight(roots[i]->cpuset);
 
   for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
     unsigned chunk, weight;
     hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
     hwloc_cpuset_t cpuset = root->cpuset;
-    weight = hwloc_bitmap_weight(cpuset);
+    if (root->type == HWLOC_OBJ_NUMANODE)
+      /* NUMANodes have same cpuset as their parent, but we need normal objects below */
+      root = root->parent;
+    weight = (unsigned) hwloc_bitmap_weight(cpuset);
     if (!weight)
       continue;
     /* Give to root a chunk proportional to its weight.
@@ -775,20 +898,18 @@ hwloc_distrib(hwloc_topology_t topology,
 /** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
  * @{
  */
+
 /** \brief Get complete CPU set
  *
  * \return the complete CPU set of logical processors of the system.
  *
  * \note The returned cpuset is not newly allocated and should thus not be
  * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete CPU-set.
  */
-static __hwloc_inline hwloc_const_cpuset_t
+HWLOC_DECLSPEC hwloc_const_cpuset_t
 hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_cpuset_t
-hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->complete_cpuset;
-}
 
 /** \brief Get topology CPU set
  *
@@ -798,29 +919,30 @@ hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
  *
  * \note The returned cpuset is not newly allocated and should thus not be
  * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object CPU-set.
  */
-static __hwloc_inline hwloc_const_cpuset_t
+HWLOC_DECLSPEC hwloc_const_cpuset_t
 hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_cpuset_t
-hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->cpuset;
-}
 
 /** \brief Get allowed CPU set
  *
  * \return the CPU set of allowed logical processors of the system.
  *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set,
+ * this is identical to hwloc_topology_get_topology_cpuset(), which means
+ * all PUs are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * cpuset checks whether there are allowed PUs inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed PUs.
+ *
  * \note The returned cpuset is not newly allocated and should thus not be
  * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
  */
-static __hwloc_inline hwloc_const_cpuset_t
+HWLOC_DECLSPEC hwloc_const_cpuset_t
 hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_cpuset_t
-hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->allowed_cpuset;
-}
 
 /** \brief Get complete node set
  *
@@ -828,14 +950,11 @@ hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
  *
  * \note The returned nodeset is not newly allocated and should thus not be
  * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object complete nodeset.
  */
-static __hwloc_inline hwloc_const_nodeset_t
+HWLOC_DECLSPEC hwloc_const_nodeset_t
 hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_nodeset_t
-hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->complete_nodeset;
-}
 
 /** \brief Get topology node set
  *
@@ -845,29 +964,30 @@ hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
  *
  * \note The returned nodeset is not newly allocated and should thus not be
  * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ *
+ * \note This is equivalent to retrieving the root object nodeset.
  */
-static __hwloc_inline hwloc_const_nodeset_t
+HWLOC_DECLSPEC hwloc_const_nodeset_t
 hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_nodeset_t
-hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->nodeset;
-}
 
 /** \brief Get allowed node set
  *
  * \return the node set of allowed memory of the system.
  *
+ * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set,
+ * this is identical to hwloc_topology_get_topology_nodeset(), which means
+ * all NUMA nodes are allowed.
+ *
+ * \note If ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was set, applying
+ * hwloc_bitmap_intersects() on the result of this function and on an object
+ * nodeset checks whether there are allowed NUMA nodes inside that object.
+ * Applying hwloc_bitmap_and() returns the list of these allowed NUMA nodes.
+ *
  * \note The returned nodeset is not newly allocated and should thus not be
  * changed or freed, hwloc_bitmap_dup() must be used to obtain a local copy.
  */
-static __hwloc_inline hwloc_const_nodeset_t
+HWLOC_DECLSPEC hwloc_const_nodeset_t
 hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
-static __hwloc_inline hwloc_const_nodeset_t
-hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
-{
-  return hwloc_get_root_obj(topology)->allowed_nodeset;
-}
 
 /** @} */
 
@@ -875,17 +995,6 @@ hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
 
 /** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
  *
- * There are two semantics for converting cpusets to nodesets depending on how
- * non-NUMA machines are handled.
- *
- * When manipulating nodesets for memory binding, non-NUMA machines should be
- * considered as having a single NUMA node. The standard conversion routines
- * below should be used so that marking the first bit of the nodeset means
- * that memory should be bound to a non-NUMA whole machine.
- *
- * When manipulating nodesets as an actual list of NUMA nodes without any
- * need to handle memory binding on non-NUMA machines, the strict conversion
- * routines may be used instead.
  * @{
  */
 
@@ -899,7 +1008,7 @@ hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
  * If \p cpuset is empty, \p nodeset will be emptied as well.
  * Otherwise \p nodeset will be entirely filled.
  */
-static __hwloc_inline void
+static __hwloc_inline int
 hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
 {
 	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
@@ -907,25 +1016,9 @@ hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset,
 	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
 	hwloc_bitmap_zero(nodeset);
 	while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
-		hwloc_bitmap_set(nodeset, obj->os_index);
-}
-
-/** \brief Convert a CPU set into a NUMA node set without handling non-NUMA cases
- *
- * This is the strict variant of ::hwloc_cpuset_to_nodeset. It does not fix
- * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
- * the same. However, if the topology contains no NUMA nodes, return an empty
- * nodeset.
- */
-static __hwloc_inline void
-hwloc_cpuset_to_nodeset_strict(struct hwloc_topology *topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
-{
-	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
-	hwloc_obj_t obj = NULL;
-	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
-	hwloc_bitmap_zero(nodeset);
-	while ((obj = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, depth, obj)) != NULL)
-		hwloc_bitmap_set(nodeset, obj->os_index);
+		if (hwloc_bitmap_set(nodeset, obj->os_index) < 0)
+			return -1;
+	return 0;
 }
 
 /** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
@@ -936,7 +1029,7 @@ hwloc_cpuset_to_nodeset_strict(struct hwloc_topology *topology, hwloc_const_cpus
  * Otherwise \p cpuset will be entirely filled.
  * This is useful for manipulating memory binding sets.
  */
-static __hwloc_inline void
+static __hwloc_inline int
 hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
 {
 	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
@@ -946,169 +1039,10 @@ hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwl
 	while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL) {
 		if (hwloc_bitmap_isset(nodeset, obj->os_index))
 			/* no need to check obj->cpuset because objects in levels always have a cpuset */
-			hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
+			if (hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset) < 0)
+				return -1;
 	}
-}
-
-/** \brief Convert a NUMA node set into a CPU set without handling non-NUMA cases
- *
- * This is the strict variant of ::hwloc_cpuset_from_nodeset. It does not fix
- * non-NUMA cases. If the topology contains some NUMA nodes, behave exactly
- * the same. However, if the topology contains no NUMA nodes, return an empty
- * cpuset.
- */
-static __hwloc_inline void
-hwloc_cpuset_from_nodeset_strict(struct hwloc_topology *topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
-{
-	int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
-	hwloc_obj_t obj = NULL;
-	assert(depth != HWLOC_TYPE_DEPTH_UNKNOWN);
-	hwloc_bitmap_zero(_cpuset);
-	while ((obj = hwloc_get_next_obj_by_depth(topology, depth, obj)) != NULL)
-		if (hwloc_bitmap_isset(nodeset, obj->os_index))
-			/* no need to check obj->cpuset because objects in levels always have a cpuset */
-			hwloc_bitmap_or(_cpuset, _cpuset, obj->cpuset);
-}
-
-/** @} */
-
-
-
-/** \defgroup hwlocality_distances Manipulating Distances
- * @{
- */
-
-/** \brief Get the distances between all objects at the given depth.
- *
- * \return a distances structure containing a matrix with all distances
- * between all objects at the given depth.
- *
- * Slot i+nbobjs*j contains the distance from the object of logical index i
- * the object of logical index j.
- *
- * \note This function only returns matrices covering the whole topology,
- * without any unknown distance value. Those matrices are available in
- * top-level object of the hierarchy. Matrices of lower objects are not
- * reported here since they cover only part of the machine.
- *
- * The returned structure belongs to the hwloc library. The caller should
- * not modify or free it.
- *
- * \return \c NULL if no such distance matrix exists.
- */
-
-static __hwloc_inline const struct hwloc_distances_s *
-hwloc_get_whole_distance_matrix_by_depth(hwloc_topology_t topology, unsigned depth)
-{
-  hwloc_obj_t root = hwloc_get_root_obj(topology);
-  unsigned i;
-  for(i=0; i<root->distances_count; i++)
-    if (root->distances[i]->relative_depth == depth)
-      return root->distances[i];
-  return NULL;
-}
-
-/** \brief Get the distances between all objects of a given type.
- *
- * \return a distances structure containing a matrix with all distances
- * between all objects of the given type.
- *
- * Slot i+nbobjs*j contains the distance from the object of logical index i
- * the object of logical index j.
- *
- * \note This function only returns matrices covering the whole topology,
- * without any unknown distance value. Those matrices are available in
- * top-level object of the hierarchy. Matrices of lower objects are not
- * reported here since they cover only part of the machine.
- *
- * The returned structure belongs to the hwloc library. The caller should
- * not modify or free it.
- *
- * \return \c NULL if no such distance matrix exists.
- */
-
-static __hwloc_inline const struct hwloc_distances_s *
-hwloc_get_whole_distance_matrix_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
-{
-  int depth = hwloc_get_type_depth(topology, type);
-  if (depth < 0)
-    return NULL;
-  return hwloc_get_whole_distance_matrix_by_depth(topology, depth);
-}
-
-/** \brief Get distances for the given depth and covering some objects
- *
- * Return a distance matrix that describes depth \p depth and covers at
- * least object \p obj and all its children.
- *
- * When looking for the distance between some objects, a common ancestor should
- * be passed in \p obj.
- *
- * \p firstp is set to logical index of the first object described by the matrix.
- *
- * The returned structure belongs to the hwloc library. The caller should
- * not modify or free it.
- */
-static __hwloc_inline const struct hwloc_distances_s *
-hwloc_get_distance_matrix_covering_obj_by_depth(hwloc_topology_t topology,
-						hwloc_obj_t obj, unsigned depth,
-						unsigned *firstp)
-{
-  if (!obj->cpuset)
-    return NULL;
-  while (obj) {
-    unsigned i;
-    for(i=0; i<obj->distances_count; i++)
-      if (obj->distances[i]->relative_depth == depth - obj->depth) {
-	if (!obj->distances[i]->nbobjs)
-	  continue;
-	*firstp = hwloc_get_next_obj_inside_cpuset_by_depth(topology, obj->cpuset, depth, NULL)->logical_index;
-	return obj->distances[i];
-      }
-    obj = obj->parent;
-  }
-  return NULL;
-}
-
-/** \brief Get the latency in both directions between two objects.
- *
- * Look at ancestor objects from the bottom to the top until one of them
- * contains a distance matrix that matches the objects exactly.
- *
- * \p latency gets the value from object \p obj1 to \p obj2, while
- * \p reverse_latency gets the reverse-direction value, which
- * may be different on some architectures.
- *
- * \return -1 if no ancestor contains a matching latency matrix.
- */
-static __hwloc_inline int
-hwloc_get_latency(hwloc_topology_t topology,
-		   hwloc_obj_t obj1, hwloc_obj_t obj2,
-		   float *latency, float *reverse_latency)
-{
-  hwloc_obj_t ancestor;
-  const struct hwloc_distances_s * distances;
-  unsigned first_logical ;
-
-  if (obj1->depth != obj2->depth) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  ancestor = hwloc_get_common_ancestor_obj(topology, obj1, obj2);
-  distances = hwloc_get_distance_matrix_covering_obj_by_depth(topology, ancestor, obj1->depth, &first_logical);
-  if (distances && distances->latency) {
-    const float * latency_matrix = distances->latency;
-    unsigned nbobjs = distances->nbobjs;
-    unsigned l1 = obj1->logical_index - first_logical;
-    unsigned l2 = obj2->logical_index - first_logical;
-    *latency = latency_matrix[l1*nbobjs+l2];
-    *reverse_latency = latency_matrix[l2*nbobjs+l1];
-    return 0;
-  }
-
-  errno = ENOSYS;
-  return -1;
+	return 0;
 }
 
 /** @} */
@@ -1122,8 +1056,13 @@ hwloc_get_latency(hwloc_topology_t topology,
 /** \brief Get the first non-I/O ancestor object.
  *
  * Given the I/O object \p ioobj, find the smallest non-I/O ancestor
- * object. This regular object may then be used for binding because
- * its locality is the same as \p ioobj.
+ * object. This object (normal or memory) may then be used for binding
+ * because it has non-NULL CPU and node sets
+ * and because its locality is the same as \p ioobj.
+ *
+ * \note The resulting object is usually a normal object but it could also
+ * be a memory object (e.g. NUMA node) in future platforms if I/O objects
+ * ever get attached to memory instead of CPUs.
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
@@ -1215,28 +1154,6 @@ hwloc_bridge_covers_pcibus(hwloc_obj_t bridge,
     && bridge->attr->bridge.downstream.pci.subordinate_bus >= bus;
 }
 
-/** \brief Find the hostbridge that covers the given PCI bus.
- *
- * This is useful for finding the locality of a bus because
- * it is the hostbridge parent cpuset.
- */
-static __hwloc_inline hwloc_obj_t
-hwloc_get_hostbridge_by_pcibus(hwloc_topology_t topology,
-			       unsigned domain, unsigned bus)
-{
-  hwloc_obj_t obj = NULL;
-  while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) {
-    if (hwloc_bridge_covers_pcibus(obj, domain, bus)) {
-      /* found bridge covering this pcibus, make sure it's a hostbridge */
-      assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST);
-      assert(obj->parent->type != HWLOC_OBJ_BRIDGE);
-      assert(obj->parent->cpuset);
-      return obj;
-    }
-  }
-  return NULL;
-}
-
 /** @} */
 
 
diff --git a/ext/hwloc/include/hwloc/inlines.h b/ext/hwloc/include/hwloc/inlines.h
index 7281750a8..494209ea6 100644
--- a/ext/hwloc/include/hwloc/inlines.h
+++ b/ext/hwloc/include/hwloc/inlines.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2018 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -38,7 +38,7 @@ hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
     if (hwloc_compare_types(hwloc_get_depth_type(topology, depth), type) < 0)
       return depth+1;
 
-  /* Shouldn't ever happen, as there is always a SYSTEM level with lower order and known depth.  */
+  /* Shouldn't ever happen, as there is always a Machine level with lower order and known depth.  */
   /* abort(); */
 }
 
@@ -67,7 +67,7 @@ hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
     return 0;
   if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
     return -1; /* FIXME: agregate nbobjs from different levels? */
-  return hwloc_get_nbobjs_by_depth(topology, depth);
+  return (int) hwloc_get_nbobjs_by_depth(topology, depth);
 }
 
 static __hwloc_inline hwloc_obj_t
@@ -82,7 +82,7 @@ hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigne
 }
 
 static __hwloc_inline hwloc_obj_t
-hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev)
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev)
 {
   if (!prev)
     return hwloc_get_obj_by_depth (topology, depth, 0);
@@ -111,33 +111,25 @@ static __hwloc_inline const char *
 hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
 {
   unsigned i;
-  for(i=0; i<obj->infos_count; i++)
-    if (!strcmp(obj->infos[i].name, name))
-      return obj->infos[i].value;
+  for(i=0; i<obj->infos_count; i++) {
+    struct hwloc_info_s *info = &obj->infos[i];
+    if (!strcmp(info->name, name))
+      return info->value;
+  }
   return NULL;
 }
 
-static __hwloc_inline void *
-hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
-{
-  void *p = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
-  if (p)
-    return p;
-  hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
-  p = hwloc_alloc(topology, len);
-  if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
-    /* Enforce the binding by touching the data */
-    memset(p, 0, len);
-  return p;
-}
-
 static __hwloc_inline void *
 hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
 {
   void *p = hwloc_alloc_membind(topology, len, set, policy, flags);
   if (p)
     return p;
-  hwloc_set_membind(topology, set, policy, flags);
+
+  if (hwloc_set_membind(topology, set, policy, flags) < 0)
+    /* hwloc_set_membind() takes care of ignoring errors if non-STRICT */
+    return NULL;
+
   p = hwloc_alloc(topology, len);
   if (p && policy != HWLOC_MEMBIND_FIRSTTOUCH)
     /* Enforce the binding by touching the data */
diff --git a/ext/hwloc/include/hwloc/intel-mic.h b/ext/hwloc/include/hwloc/intel-mic.h
index d58237b3d..c504cd7e0 100644
--- a/ext/hwloc/include/hwloc/intel-mic.h
+++ b/ext/hwloc/include/hwloc/intel-mic.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2013 Inria.  All rights reserved.
+ * Copyright © 2013-2016 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
@@ -13,11 +13,13 @@
 #ifndef HWLOC_INTEL_MIC_H
 #define HWLOC_INTEL_MIC_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc/helper.h"
+
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
+
 #include <dirent.h>
 #include <string.h>
 #endif
@@ -64,7 +66,6 @@ hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_un
 #define HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX 128
 	char path[HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX];
 	DIR *sysdir = NULL;
-	FILE *sysfile = NULL;
 	struct dirent *dirent;
 	unsigned pcibus, pcidev, pcifunc;
 
@@ -81,17 +82,9 @@ hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_un
 	while ((dirent = readdir(sysdir)) != NULL) {
 		if (sscanf(dirent->d_name, "pci_%02x:%02x.%02x", &pcibus, &pcidev, &pcifunc) == 3) {
 			sprintf(path, "/sys/class/mic/mic%d/pci_%02x:%02x.%02x/local_cpus", idx, pcibus, pcidev, pcifunc);
-			sysfile = fopen(path, "r");
-			if (!sysfile) {
-				closedir(sysdir);
-				return -1;
-			}
-
-			hwloc_linux_parse_cpumap_file(sysfile, set);
-			if (hwloc_bitmap_iszero(set))
+			if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+			    || hwloc_bitmap_iszero(set))
 				hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-			fclose(sysfile);
 			break;
 		}
 	}
diff --git a/ext/hwloc/include/hwloc/linux-libnuma.h b/ext/hwloc/include/hwloc/linux-libnuma.h
index 0ce25910a..0e2cc19f7 100644
--- a/ext/hwloc/include/hwloc/linux-libnuma.h
+++ b/ext/hwloc/include/hwloc/linux-libnuma.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2017 Inria.  All rights reserved.
  * Copyright © 2009-2010, 2012 Université Bordeaux
  * See COPYING in top-level directory.
  */
@@ -15,7 +15,8 @@
 #ifndef HWLOC_LINUX_LIBNUMA_H
 #define HWLOC_LINUX_LIBNUMA_H
 
-#include <hwloc.h>
+#include "hwloc.h"
+
 #include <numa.h>
 
 
@@ -196,7 +197,7 @@ hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpu
   if (!bitmask)
     return NULL;
   while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL)
-    if (node->memory.local_memory)
+    if (node->attr->numanode.local_memory)
       numa_bitmask_setbit(bitmask, node->os_index);
   return bitmask;
 }
@@ -221,7 +222,7 @@ hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_no
   if (!bitmask)
     return NULL;
   while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
-    if (hwloc_bitmap_isset(nodeset, node->os_index) && node->memory.local_memory)
+    if (hwloc_bitmap_isset(nodeset, node->os_index) && node->attr->numanode.local_memory)
       numa_bitmask_setbit(bitmask, node->os_index);
   return bitmask;
 }
diff --git a/ext/hwloc/include/hwloc/linux.h b/ext/hwloc/include/hwloc/linux.h
index 4ddc90090..ecc86be3d 100644
--- a/ext/hwloc/include/hwloc/linux.h
+++ b/ext/hwloc/include/hwloc/linux.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2016 Inria.  All rights reserved.
  * Copyright © 2009-2011 Université Bordeaux
  * See COPYING in top-level directory.
  */
@@ -15,7 +15,8 @@
 #ifndef HWLOC_LINUX_H
 #define HWLOC_LINUX_H
 
-#include <hwloc.h>
+#include "hwloc.h"
+
 #include <stdio.h>
 
 
@@ -32,13 +33,6 @@ extern "C" {
  * @{
  */
 
-/** \brief Convert a linux kernel cpumap file \p file into hwloc CPU set.
- *
- * Might be used when reading CPU set from sysfs attributes such as topology
- * and caches for processors, or local_cpus for devices.
- */
-HWLOC_DECLSPEC int hwloc_linux_parse_cpumap_file(FILE *file, hwloc_cpuset_t set);
-
 /** \brief Bind a thread \p tid on cpus given in cpuset \p set
  *
  * The behavior is exactly the same as the Linux sched_setaffinity system call,
@@ -55,17 +49,26 @@ HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t
  * but uses a hwloc cpuset.
  *
  * \note This is equivalent to calling hwloc_get_proc_cpubind() with
- * HWLOC_CPUBIND_THREAD as flags.
+ * ::HWLOC_CPUBIND_THREAD as flags.
  */
 HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
 
 /** \brief Get the last physical CPU where thread \p tid ran.
  *
  * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
- * HWLOC_CPUBIND_THREAD as flags.
+ * ::HWLOC_CPUBIND_THREAD as flags.
  */
 HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology, pid_t tid, hwloc_bitmap_t set);
 
+/** \brief Convert a linux kernel cpumask file \p path into a hwloc bitmap \p set.
+ *
+ * Might be used when reading CPU set from sysfs attributes such as topology
+ * and caches for processors, or local_cpus for devices.
+ *
+ * \note This function ignores the HWLOC_FSROOT environment variable.
+ */
+HWLOC_DECLSPEC int hwloc_linux_read_path_as_cpumask(const char *path, hwloc_bitmap_t set);
+
 /** @} */
 
 
@@ -74,4 +77,4 @@ HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topolo
 #endif
 
 
-#endif /* HWLOC_GLIBC_SCHED_H */
+#endif /* HWLOC_LINUX_H */
diff --git a/ext/hwloc/include/hwloc/myriexpress.h b/ext/hwloc/include/hwloc/myriexpress.h
deleted file mode 100644
index 68ff88f5a..000000000
--- a/ext/hwloc/include/hwloc/myriexpress.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright © 2010-2014 Inria.  All rights reserved.
- * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
- * See COPYING in top-level directory.
- */
-
-/** \file
- * \brief Macros to help interaction between hwloc and Myrinet Express.
- *
- * Applications that use both hwloc and Myrinet Express verbs may want to
- * include this file so as to get topology information for Myrinet hardware.
- *
- */
-
-#ifndef HWLOC_MYRIEXPRESS_H
-#define HWLOC_MYRIEXPRESS_H
-
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-
-#include <myriexpress.h>
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/** \defgroup hwlocality_myriexpress Interoperability with Myrinet Express
- *
- * This interface offers ways to retrieve topology information about
- * Myrinet Express hardware.
- *
- * @{
- */
-
-/** \brief Get the CPU set of logical processors that are physically
- * close the MX board \p id.
- *
- * Return the CPU set describing the locality of the Myrinet Express
- * board whose index is \p id.
- *
- * Topology \p topology and device \p id must match the local machine.
- * I/O devices detection is not needed in the topology.
- *
- * The function only returns the locality of the device.
- * No additional information about the device is available.
- */
-static __hwloc_inline int
-hwloc_mx_board_get_device_cpuset(hwloc_topology_t topology,
-				 unsigned id, hwloc_cpuset_t set)
-{
-  uint32_t in, out;
-
-  if (!hwloc_topology_is_thissystem(topology)) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  in = id;
-  if (mx_get_info(NULL, MX_NUMA_NODE, &in, sizeof(in), &out, sizeof(out)) != MX_SUCCESS) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  if (out != (uint32_t) -1) {
-    hwloc_obj_t obj = NULL;
-    while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL)
-      if (obj->os_index == out) {
-	hwloc_bitmap_copy(set, obj->cpuset);
-	goto out;
-      }
-  }
-  /* fallback to the full topology cpuset */
-  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
- out:
-  return 0;
-}
-
-/** \brief Get the CPU set of logical processors that are physically
- * close the MX endpoint \p endpoint.
- *
- * Return the CPU set describing the locality of the Myrinet Express
- * board that runs the MX endpoint \p endpoint.
- *
- * Topology \p topology and device \p id must match the local machine.
- * I/O devices detection is not needed in the topology.
- *
- * The function only returns the locality of the endpoint.
- * No additional information about the endpoint or device is available.
- */
-static __hwloc_inline int
-hwloc_mx_endpoint_get_device_cpuset(hwloc_topology_t topology,
-				    mx_endpoint_t endpoint, hwloc_cpuset_t set)
-{
-  uint64_t nid;
-  uint32_t nindex, eid;
-  mx_endpoint_addr_t eaddr;
-
-  if (mx_get_endpoint_addr(endpoint, &eaddr) != MX_SUCCESS) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  if (mx_decompose_endpoint_addr(eaddr, &nid, &eid) != MX_SUCCESS) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  if (mx_nic_id_to_board_number(nid, &nindex) != MX_SUCCESS) {
-    errno = EINVAL;
-    return -1;
-  }
-
-  return hwloc_mx_board_get_device_cpuset(topology, nindex, set);
-}
-
-/** @} */
-
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-
-#endif /* HWLOC_MYRIEXPRESS_H */
diff --git a/ext/hwloc/include/hwloc/nvml.h b/ext/hwloc/include/hwloc/nvml.h
index 462b33266..1bc2599f6 100644
--- a/ext/hwloc/include/hwloc/nvml.h
+++ b/ext/hwloc/include/hwloc/nvml.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012-2013 Inria.  All rights reserved.
+ * Copyright © 2012-2016 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
@@ -13,11 +13,11 @@
 #ifndef HWLOC_NVML_H
 #define HWLOC_NVML_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc/helper.h"
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
 #endif
 
 #include <nvml.h>
@@ -60,7 +60,6 @@ hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
   /* If we're on Linux, use the sysfs mechanism to get the local cpus */
 #define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
   char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
-  FILE *sysfile = NULL;
   nvmlReturn_t nvres;
   nvmlPciInfo_t pci;
 
@@ -76,15 +75,9 @@ hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
   }
 
   sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
-  sysfile = fopen(path, "r");
-  if (!sysfile)
-    return -1;
-
-  hwloc_linux_parse_cpumap_file(sysfile, set);
-  if (hwloc_bitmap_iszero(set))
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
     hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-  fclose(sysfile);
 #else
   /* Non-Linux systems simply get a full cpuset */
   hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
@@ -103,7 +96,7 @@ hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
  * I/O devices detection and the NVML component must be enabled in the topology.
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
@@ -130,7 +123,7 @@ hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
  * hwloc_nvml_get_device_cpuset().
  *
  * \note The corresponding hwloc PCI device may be found by looking
- * at the result parent pointer.
+ * at the result parent pointer (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
@@ -138,6 +131,7 @@ hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
 	hwloc_obj_t osdev;
 	nvmlReturn_t nvres;
 	nvmlPciInfo_t pci;
+	char uuid[64];
 
 	if (!hwloc_topology_is_thissystem(topology)) {
 		errno = EINVAL;
@@ -148,11 +142,18 @@ hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
 	if (NVML_SUCCESS != nvres)
 		return NULL;
 
+	nvres = nvmlDeviceGetUUID(device, uuid, sizeof(uuid));
+	if (NVML_SUCCESS != nvres)
+		uuid[0] = '\0';
+
 	osdev = NULL;
 	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
 		hwloc_obj_t pcidev = osdev->parent;
+		const char *info;
+
 		if (strncmp(osdev->name, "nvml", 4))
 			continue;
+
 		if (pcidev
 		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
 		    && pcidev->attr->pcidev.domain == pci.domain
@@ -160,6 +161,10 @@ hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
 		    && pcidev->attr->pcidev.dev == pci.device
 		    && pcidev->attr->pcidev.func == 0)
 			return osdev;
+
+		info = hwloc_obj_get_info_by_name(osdev, "NVIDIAUUID");
+		if (info && !strcmp(info, uuid))
+			return osdev;
 	}
 
 	return NULL;
diff --git a/ext/hwloc/include/hwloc/opencl.h b/ext/hwloc/include/hwloc/opencl.h
index 0301ad988..ebf09848f 100644
--- a/ext/hwloc/include/hwloc/opencl.h
+++ b/ext/hwloc/include/hwloc/opencl.h
@@ -1,6 +1,6 @@
 /*
- * Copyright © 2012-2013 Inria.  All rights reserved.
- * Copyright © 2013 Université Bordeaux.  All right reserved.
+ * Copyright © 2012-2018 Inria.  All rights reserved.
+ * Copyright © 2013, 2018 Université Bordeaux.  All right reserved.
  * See COPYING in top-level directory.
  */
 
@@ -14,15 +14,18 @@
 #ifndef HWLOC_OPENCL_H
 #define HWLOC_OPENCL_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
-#include <hwloc/helper.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
+#include "hwloc/helper.h"
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
 #endif
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
-#include <CL/cl_ext.h>
+#endif
 
 #include <stdio.h>
 
@@ -32,17 +35,75 @@ extern "C" {
 #endif
 
 
+/* OpenCL extensions aren't always shipped with default headers, and
+ * they don't always reflect what the installed implementations support.
+ * Try everything and let the implementation return errors when not supported.
+ */
+/* Copyright (c) 2008-2018 The Khronos Group Inc. */
+
+/* needs "cl_amd_device_attribute_query" device extension, but not strictly required for clGetDeviceInfo() */
+#define HWLOC_CL_DEVICE_TOPOLOGY_AMD 0x4037
+typedef union {
+    struct { cl_uint type; cl_uint data[5]; } raw;
+    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
+} hwloc_cl_device_topology_amd;
+#define HWLOC_CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1
+
+/* needs "cl_nv_device_attribute_query" device extension, but not strictly required for clGetDeviceInfo() */
+#define HWLOC_CL_DEVICE_PCI_BUS_ID_NV 0x4008
+#define HWLOC_CL_DEVICE_PCI_SLOT_ID_NV 0x4009
+
+
 /** \defgroup hwlocality_opencl Interoperability with OpenCL
  *
  * This interface offers ways to retrieve topology information about
  * OpenCL devices.
  *
- * Only the AMD OpenCL interface currently offers useful locality information
- * about its devices.
+ * Only AMD and NVIDIA OpenCL implementations currently offer useful locality
+ * information about their devices.
  *
  * @{
  */
 
+/** \brief Return the PCI domain, bus, device and function IDs of the OpenCL device \p device.
+ *
+ * Device \p device must match the local machine. Returns 0 on success, -1 if neither the AMD nor the NVIDIA query succeeded.
+ */
+static __hwloc_inline int
+hwloc_opencl_get_device_pci_busid(cl_device_id device,
+                               unsigned *domain, unsigned *bus, unsigned *dev, unsigned *func)
+{
+	hwloc_cl_device_topology_amd amdtopo;
+	cl_uint nvbus, nvslot;
+	cl_int clret;
+
+	clret = clGetDeviceInfo(device, HWLOC_CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS == clret && HWLOC_CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD == amdtopo.raw.type) {
+		*domain = 0; /* can't do anything better */
+		/* fields are cl_char (signed 8-bit): go through unsigned char so IDs >= 0x80 are not sign-extended */
+		*bus = (unsigned) (unsigned char) amdtopo.pcie.bus;
+		*dev = (unsigned) (unsigned char) amdtopo.pcie.device;
+		*func = (unsigned) (unsigned char) amdtopo.pcie.function;
+		return 0;
+	}
+
+	clret = clGetDeviceInfo(device, HWLOC_CL_DEVICE_PCI_BUS_ID_NV, sizeof(nvbus), &nvbus, NULL);
+	if (CL_SUCCESS == clret) {
+		clret = clGetDeviceInfo(device, HWLOC_CL_DEVICE_PCI_SLOT_ID_NV, sizeof(nvslot), &nvslot, NULL);
+		if (CL_SUCCESS == clret) {
+			/* FIXME: PCI bus only uses 8 bits, assume NVIDIA encodes the domain in the higher bits */
+			*domain = nvbus >> 8;
+			*bus = nvbus & 0xff;
+			/* slot layout (device in the upper bits, function in the low 3 bits) is undocumented but used in many other projects */
+			*dev = nvslot >> 3;
+			*func = nvslot & 0x7;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
 /** \brief Get the CPU set of logical processors that are physically
  * close to OpenCL device \p device.
  *
@@ -57,7 +118,7 @@ extern "C" {
  * and hwloc_opencl_get_device_osdev_by_index().
  *
  * This function is currently only implemented in a meaningful way for
- * Linux with the AMD OpenCL implementation; other systems will simply
+ * Linux with the AMD or NVIDIA OpenCL implementation; other systems will simply
  * get a full cpuset.
  */
 static __hwloc_inline int
@@ -65,41 +126,28 @@ hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unuse
 			       cl_device_id device __hwloc_attribute_unused,
 			       hwloc_cpuset_t set)
 {
-#if (defined HWLOC_LINUX_SYS) && (defined CL_DEVICE_TOPOLOGY_AMD)
-	/* If we're on Linux + AMD OpenCL, use the AMD extension + the sysfs mechanism to get the local cpus */
+#if (defined HWLOC_LINUX_SYS)
+	/* If we're on Linux, try AMD/NVIDIA extensions + the sysfs mechanism to get the local cpus */
 #define HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX 128
 	char path[HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX];
-	FILE *sysfile = NULL;
-	cl_device_topology_amd amdtopo;
-	cl_int clret;
+	unsigned pcidomain, pcibus, pcidev, pcifunc;
 
 	if (!hwloc_topology_is_thissystem(topology)) {
 		errno = EINVAL;
 		return -1;
 	}
 
-	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
-	if (CL_SUCCESS != clret) {
+	if (hwloc_opencl_get_device_pci_busid(device, &pcidomain, &pcibus, &pcidev, &pcifunc) < 0) {
 		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
 		return 0;
 	}
-	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
-		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-		return 0;
-	}
-
-	sprintf(path, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/local_cpus", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
-	sysfile = fopen(path, "r");
-	if (!sysfile)
-		return -1;
 
-	hwloc_linux_parse_cpumap_file(sysfile, set);
-	if (hwloc_bitmap_iszero(set))
+	sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus", pcidomain, pcibus, pcidev, pcifunc);
+	if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+	    || hwloc_bitmap_iszero(set))
 		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-	fclose(sysfile);
 #else
-	/* Non-Linux + AMD OpenCL systems simply get a full cpuset */
+	/* Non-Linux systems simply get a full cpuset */
 	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
 #endif
   return 0;
@@ -118,7 +166,7 @@ hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unuse
  * I/O devices detection and the OpenCL component must be enabled in the topology.
  *
  * \note The corresponding PCI device object can be obtained by looking
- * at the OS device parent object.
+ * at the OS device parent object (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
@@ -136,34 +184,34 @@ hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
         return NULL;
 }
 
-/** \brief Get the hwloc OS device object corresponding to OpenCL device \p device.
+/** \brief Get the hwloc OS device object corresponding to OpenCL device \p device.
  *
- * Return the hwloc OS device object that describes the given
- * OpenCL device \p device. Return NULL if there is none.
+ * Use OpenCL device attributes to find the corresponding hwloc OS device object.
+ * Return NULL if there is none or if useful attributes are not available.
+ *
+ * This function currently only works on AMD and NVIDIA OpenCL devices that support
+ * relevant OpenCL extensions. hwloc_opencl_get_device_osdev_by_index()
+ * should be preferred whenever possible, i.e. when platform and device index
+ * are known.
  *
  * Topology \p topology and device \p device must match the local machine.
  * I/O devices detection and the OpenCL component must be enabled in the topology.
  * If not, the locality of the object may still be found using
  * hwloc_opencl_get_device_cpuset().
  *
+ * \note This function cannot work if PCI devices are filtered out.
+ *
  * \note The corresponding hwloc PCI device may be found by looking
- * at the result parent pointer.
+ * at the result parent pointer (unless PCI devices are filtered out).
  */
 static __hwloc_inline hwloc_obj_t
 hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
 			      cl_device_id device __hwloc_attribute_unused)
 {
-#ifdef CL_DEVICE_TOPOLOGY_AMD
 	hwloc_obj_t osdev;
-	cl_device_topology_amd amdtopo;
-	cl_int clret;
+	unsigned pcidomain, pcibus, pcidevice, pcifunc;
 
-	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
-	if (CL_SUCCESS != clret) {
-		errno = EINVAL;
-		return NULL;
-	}
-	if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+	if (hwloc_opencl_get_device_pci_busid(device, &pcidomain, &pcibus, &pcidevice, &pcifunc) < 0) {
 		errno = EINVAL;
 		return NULL;
 	}
@@ -175,17 +223,15 @@ hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused
 			continue;
 		if (pcidev
 		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
-		    && pcidev->attr->pcidev.domain == 0
-		    && pcidev->attr->pcidev.bus == amdtopo.pcie.bus
-		    && pcidev->attr->pcidev.dev == amdtopo.pcie.device
-		    && pcidev->attr->pcidev.func == amdtopo.pcie.function)
+		    && pcidev->attr->pcidev.domain == pcidomain
+		    && pcidev->attr->pcidev.bus == pcibus
+		    && pcidev->attr->pcidev.dev == pcidevice
+		    && pcidev->attr->pcidev.func == pcifunc)
 			return osdev;
+		/* if PCI devices are filtered out, we need an info attr to match on */
 	}
 
 	return NULL;
-#else
-	return NULL;
-#endif
 }
 
 /** @} */
diff --git a/ext/hwloc/include/hwloc/openfabrics-verbs.h b/ext/hwloc/include/hwloc/openfabrics-verbs.h
index c6b853374..d247a8b1c 100644
--- a/ext/hwloc/include/hwloc/openfabrics-verbs.h
+++ b/ext/hwloc/include/hwloc/openfabrics-verbs.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2016 Inria.  All rights reserved.
  * Copyright © 2009-2010 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -12,17 +12,17 @@
  *
  * Applications that use both hwloc and OpenFabrics verbs may want to
  * include this file so as to get topology information for OpenFabrics
- * hardware.
+ * hardware (InfiniBand, etc).
  *
  */
 
 #ifndef HWLOC_OPENFABRICS_VERBS_H
 #define HWLOC_OPENFABRICS_VERBS_H
 
-#include <hwloc.h>
-#include <hwloc/autogen/config.h>
+#include "hwloc.h"
+#include "hwloc/autogen/config.h"
 #ifdef HWLOC_LINUX_SYS
-#include <hwloc/linux.h>
+#include "hwloc/linux.h"
 #endif
 
 #include <infiniband/verbs.h>
@@ -36,7 +36,7 @@ extern "C" {
 /** \defgroup hwlocality_openfabrics Interoperability with OpenFabrics
  *
  * This interface offers ways to retrieve topology information about
- * OpenFabrics devices.
+ * OpenFabrics devices (InfiniBand, Omni-Path, usNIC, etc).
  *
  * @{
  */
@@ -45,7 +45,7 @@ extern "C" {
  * close to device \p ibdev.
  *
  * Return the CPU set describing the locality of the OpenFabrics
- * device \p ibdev.
+ * device \p ibdev (InfiniBand, etc).
  *
  * Topology \p topology and device \p ibdev must match the local machine.
  * I/O devices detection is not needed in the topology.
@@ -67,7 +67,6 @@ hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
      get the local cpus */
 #define HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX 128
   char path[HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX];
-  FILE *sysfile = NULL;
 
   if (!hwloc_topology_is_thissystem(topology)) {
     errno = EINVAL;
@@ -76,15 +75,9 @@ hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
 
   sprintf(path, "/sys/class/infiniband/%s/device/local_cpus",
 	  ibv_get_device_name(ibdev));
-  sysfile = fopen(path, "r");
-  if (!sysfile)
-    return -1;
-
-  hwloc_linux_parse_cpumap_file(sysfile, set);
-  if (hwloc_bitmap_iszero(set))
+  if (hwloc_linux_read_path_as_cpumask(path, set) < 0
+      || hwloc_bitmap_iszero(set))
     hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
-
-  fclose(sysfile);
 #else
   /* Non-Linux systems simply get a full cpuset */
   hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
@@ -95,8 +88,10 @@ hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
 /** \brief Get the hwloc OS device object corresponding to the OpenFabrics
  * device named \p ibname.
  *
- * Return the OS device object describing the OpenFabrics device whose
- * name is \p ibname. Returns NULL if there is none.
+ * Return the OS device object describing the OpenFabrics device
+ * (InfiniBand, Omni-Path, usNIC, etc) whose name is \p ibname
+ * (mlx5_0, hfi1_0, usnic_0, qib0, etc).
+ * Returns NULL if there is none.
  * The name \p ibname is usually obtained from ibv_get_device_name().
  *
  * The topology \p topology does not necessarily have to match the current
@@ -122,8 +117,8 @@ hwloc_ibv_get_device_osdev_by_name(hwloc_topology_t topology,
 /** \brief Get the hwloc OS device object corresponding to the OpenFabrics
  * device \p ibdev.
  *
- * Return the OS device object describing the OpenFabrics device \p ibdev.
- * Returns NULL if there is none.
+ * Return the OS device object describing the OpenFabrics device \p ibdev
+ * (InfiniBand, etc). Returns NULL if there is none.
  *
  * Topology \p topology and device \p ibdev must match the local machine.
  * I/O devices detection must be enabled in the topology.
diff --git a/ext/hwloc/include/hwloc/plugins.h b/ext/hwloc/include/hwloc/plugins.h
index 7fc794d25..0f53ac4d4 100644
--- a/ext/hwloc/include/hwloc/plugins.h
+++ b/ext/hwloc/include/hwloc/plugins.h
@@ -1,5 +1,6 @@
 /*
- * Copyright © 2013-2015 Inria.  All rights reserved.
+ * Copyright © 2013-2019 Inria.  All rights reserved.
+ * Copyright © 2016 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
@@ -12,7 +13,8 @@
 
 struct hwloc_backend;
 
-#include <hwloc.h>
+#include "hwloc.h"
+
 #ifdef HWLOC_INSIDE_PLUGIN
 /* needed for hwloc_plugin_check_namespace() */
 #include <ltdl.h>
@@ -24,52 +26,36 @@ struct hwloc_backend;
  * @{
  */
 
-/** \brief Discovery component type */
-typedef enum hwloc_disc_component_type_e {
-  /** \brief CPU-only discovery through the OS, or generic no-OS support.
-   * \hideinitializer */
-  HWLOC_DISC_COMPONENT_TYPE_CPU = (1<<0),
-
-  /** \brief xml or synthetic,
-   * platform-specific components such as bgq.
-   * Anything the discovers CPU and everything else.
-   * No misc backend is expected to complement a global component.
-   * \hideinitializer */
-  HWLOC_DISC_COMPONENT_TYPE_GLOBAL = (1<<1),
-
-  /** \brief OpenCL, Cuda, etc.
-   * \hideinitializer */
-  HWLOC_DISC_COMPONENT_TYPE_MISC = (1<<2)
-} hwloc_disc_component_type_t;
-
 /** \brief Discovery component structure
  *
  * This is the major kind of components, taking care of the discovery.
  * They are registered by generic components, either statically-built or as plugins.
  */
 struct hwloc_disc_component {
-  /** \brief Discovery component type */
-  hwloc_disc_component_type_t type;
-
   /** \brief Name.
    * If this component is built as a plugin, this name does not have to match the plugin filename.
    */
   const char *name;
 
-  /** \brief Component types to exclude, as an OR'ed set of HWLOC_DISC_COMPONENT_TYPE_*.
+  /** \brief Discovery phases performed by this component.
+   * OR'ed set of ::hwloc_disc_phase_t
+   */
+  unsigned phases;
+
+  /** \brief Component phases to exclude, as an OR'ed set of ::hwloc_disc_phase_t.
    *
-   * For a GLOBAL component, this usually includes all other types (~0).
+   * For a GLOBAL component, this usually includes all other phases (\c ~0U).
    *
    * Other components only exclude types that may bring conflicting
    * topology information. MISC components should likely not be excluded
    * since they usually bring non-primary additional information.
    */
-  unsigned excludes;
+  unsigned excluded_phases;
 
   /** \brief Instantiate callback to create a backend from the component.
    * Parameters data1, data2, data3 are NULL except for components
    * that have special enabling routines such as hwloc_topology_set_xml(). */
-  struct hwloc_backend * (*instantiate)(struct hwloc_disc_component *component, const void *data1, const void *data2, const void *data3);
+  struct hwloc_backend * (*instantiate)(struct hwloc_topology *topology, struct hwloc_disc_component *component, unsigned excluded_phases, const void *data1, const void *data2, const void *data3);
 
   /** \brief Component priority.
    * Used to sort topology->components, higher priority first.
@@ -85,6 +71,11 @@ struct hwloc_disc_component {
    */
   unsigned priority;
 
+  /** \brief Enabled by default.
+   * If unset, it will be disabled unless explicitly requested.
+   */
+  unsigned enabled_by_default;
+
   /** \private Used internally to list components by priority on topology->components
    * (the component structure is usually read-only,
    *  the core copies it before using this field for queueing)
@@ -101,6 +92,72 @@ struct hwloc_disc_component {
  * @{
  */
 
+/** \brief Discovery phase */
+typedef enum hwloc_disc_phase_e {
+  /** \brief xml or synthetic, platform-specific components such as bgq.
+   * Discovers everything including CPU, memory, I/O and everything else.
+   * A component with a Global phase usually excludes all other phases.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_GLOBAL = (1U<<0),
+
+  /** \brief CPU discovery.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_CPU = (1U<<1),
+
+  /** \brief Attach memory to existing CPU objects.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_MEMORY = (1U<<2),
+
+  /** \brief Attach PCI devices and bridges to existing CPU objects.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_PCI = (1U<<3),
+
+  /** \brief I/O discovery that requires PCI devices (OS devices such as OpenCL, CUDA, etc.).
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_IO = (1U<<4),
+
+  /** \brief Misc objects that get added below anything else.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_MISC = (1U<<5),
+
+  /** \brief Annotating existing objects, adding distances, etc.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_ANNOTATE = (1U<<6),
+
+  /** \brief Final tweaks to a ready-to-use topology.
+   * This phase runs once the topology is loaded, before it is returned to the caller.
+   * Hence it may only use the main hwloc API for modifying the topology,
+   * for instance by restricting it, adding info attributes, etc.
+   * \hideinitializer */
+  HWLOC_DISC_PHASE_TWEAK = (1U<<7)
+} hwloc_disc_phase_t;
+
+/** \brief Discovery status flags */
+enum hwloc_disc_status_flag_e {
+  /** \brief The sets of allowed resources were already retrieved \hideinitializer */
+  HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES = (1UL<<1)
+};
+
+/** \brief Discovery status structure
+ *
+ * Used by the core and backends to inform about what has been/is being done
+ * during the discovery process.
+ */
+struct hwloc_disc_status {
+  /** \brief The current discovery phase that is performed.
+   * Must match one of the phases in the component phases field.
+   */
+  hwloc_disc_phase_t phase;
+
+  /** \brief Dynamically excluded phases.
+   * If a component decides during discovery that some phases are no longer needed.
+   */
+  unsigned excluded_phases;
+
+  /** \brief OR'ed set of hwloc_disc_status_flag_e */
+  unsigned long flags;
+};
+
 /** \brief Discovery backend structure
  *
  * A backend is the instantiation of a discovery component.
@@ -110,6 +167,14 @@ struct hwloc_disc_component {
  * hwloc_backend_alloc() initializes all fields to default values
  * that the component may change (except "component" and "next")
  * before enabling the backend with hwloc_backend_enable().
+ *
+ * Most backends assume that the topology is_thissystem flag is
+ * set because they talk to the underlying operating system.
+ * However they may still be used in topologies without the
+ * is_thissystem flag for debugging reasons.
+ * In practice, they are usually auto-disabled in such cases
+ * (excluded by xml or synthetic backends, or by environment
+ *  variables when changing the Linux fsroot or the x86 cpuid path).
  */
 struct hwloc_backend {
   /** \private Reserved for the core, set by hwloc_backend_alloc() */
@@ -121,12 +186,20 @@ struct hwloc_backend {
   /** \private Reserved for the core. Used internally to list backends topology->backends. */
   struct hwloc_backend * next;
 
-  /** \brief Backend flags, as an OR'ed set of HWLOC_BACKEND_FLAG_* */
+  /** \brief Discovery phases performed by this component, possibly without some of them if excluded by other components.
+   * OR'ed set of ::hwloc_disc_phase_t
+   */
+  unsigned phases;
+
+  /** \brief Backend flags, currently always 0. */
   unsigned long flags;
 
   /** \brief Backend-specific 'is_thissystem' property.
-   * Set to 0 or 1 if the backend should enforce the thissystem flag when it gets enabled.
-   * Set to -1 if the backend doesn't care (default). */
+   * Set to 0 if the backend disables the thissystem flag for this topology
+   * (e.g. loading from xml or synthetic string,
+   *  or using a different fsroot on Linux, or a x86 CPUID dump).
+   * Set to -1 if the backend doesn't care (default).
+   */
   int is_thissystem;
 
   /** \brief Backend private data, or NULL if none. */
@@ -137,52 +210,26 @@ struct hwloc_backend {
   void (*disable)(struct hwloc_backend *backend);
 
   /** \brief Main discovery callback.
-   * returns > 0 if it modified the topology tree, -1 on error, 0 otherwise.
-   * May be NULL if type is HWLOC_DISC_COMPONENT_TYPE_MISC. */
-  int (*discover)(struct hwloc_backend *backend);
-
-  /** \brief Callback used by the PCI backend to retrieve the locality of a PCI object from the OS/cpu backend.
-   * May be NULL. */
-  int (*get_obj_cpuset)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
-
-  /** \brief Callback called by backends to notify this backend that a new object was added.
-   * returns > 0 if it modified the topology tree, 0 otherwise.
-   * May be NULL. */
-  int (*notify_new_object)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj);
-};
+   * returns -1 on error, either because it couldn't add its objects to the existing topology,
+   * or because of an actual discovery/gathering failure.
+   * May be NULL.
+   */
+  int (*discover)(struct hwloc_backend *backend, struct hwloc_disc_status *status);
 
-/** \brief Backend flags */
-enum hwloc_backend_flag_e {
-  /** \brief Levels should be reconnected before this backend discover() is used.
-   * \hideinitializer */
-  HWLOC_BACKEND_FLAG_NEED_LEVELS = (1UL<<0)
+  /** \brief Callback to retrieve the locality of a PCI object.
+   * Called by the PCI core when attaching PCI hierarchy to CPU objects.
+   * May be NULL.
+   */
+  int (*get_pci_busid_cpuset)(struct hwloc_backend *backend, struct hwloc_pcidev_attr_s *busid, hwloc_bitmap_t cpuset);
 };
 
 /** \brief Allocate a backend structure, set good default values, initialize backend->component and topology, etc.
  * The caller will then modify whatever needed, and call hwloc_backend_enable().
  */
-HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_disc_component *component);
+HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_topology *topology, struct hwloc_disc_component *component);
 
 /** \brief Enable a previously allocated and setup backend. */
-HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend);
-
-/** \brief Used by backends discovery callbacks to request locality information from others.
- *
- * Traverse the list of enabled backends until one has a
- * get_obj_cpuset() method, and call it.
- */
-HWLOC_DECLSPEC int hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
-
-/** \brief Used by backends discovery callbacks to notify other
- * backends of new objects.
- *
- * Traverse the list of enabled backends (all but caller) and invoke
- * their notify_new_object() method to notify them that a new object
- * just got added to the topology.
- *
- * Currently only used for notifying of new PCI device objects.
- */
-HWLOC_DECLSPEC int hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj);
+HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_backend *backend);
 
 /** @} */
 
@@ -208,7 +255,7 @@ typedef enum hwloc_component_type_e {
  * or dynamically loaded as a plugin.
  */
 struct hwloc_component {
-  /** \brief Component ABI version, set to HWLOC_COMPONENT_ABI */
+  /** \brief Component ABI version, set to ::HWLOC_COMPONENT_ABI */
   unsigned abi;
 
   /** \brief Process-wide component initialization callback.
@@ -278,6 +325,10 @@ struct hwloc_component {
  *
  * In case of error, hwloc_report_os_error() is called.
  *
+ * The caller should check whether the object type is filtered-out before calling this function.
+ *
+ * The topology cpuset/nodesets will be enlarged to include the object sets.
+ *
  * Returns the object on success.
  * Returns NULL and frees obj on error.
  * Returns another object and frees obj if it was merged with an identical pre-existing object.
@@ -293,9 +344,11 @@ HWLOC_DECLSPEC int hwloc_hide_errors(void);
 
 /** \brief Add an object to the topology and specify which error callback to use.
  *
- * Aside from the error callback selection, this function is identical to hwloc_insert_object_by_cpuset()
+ * This function is similar to hwloc_insert_object_by_cpuset() but it allows specifying
+ * where to start insertion from (if \p root is NULL, the topology root object is used),
+ * and specifying the error callback.
  */
-HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj, hwloc_report_error_t report_error);
+HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t root, hwloc_obj_t obj, hwloc_report_error_t report_error);
 
 /** \brief Insert an object somewhere in the topology.
  *
@@ -310,22 +363,16 @@ HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_top
  *
  * The given object may have normal, I/O or Misc children, as long as they are in order as well.
  * These children must have valid parent and next_sibling pointers.
+ *
+ * The caller should check whether the object type is filtered-out before calling this function.
  */
 HWLOC_DECLSPEC void hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj);
 
-/** \brief Allocate and initialize an object of the given type and physical index */
-static __hwloc_inline struct hwloc_obj *
-hwloc_alloc_setup_object(hwloc_obj_type_t type, signed os_index)
-{
-  struct hwloc_obj *obj = malloc(sizeof(*obj));
-  memset(obj, 0, sizeof(*obj));
-  obj->type = type;
-  obj->os_index = os_index;
-  obj->attr = malloc(sizeof(*obj->attr));
-  memset(obj->attr, 0, sizeof(*obj->attr));
-  /* do not allocate the cpuset here, let the caller do it */
-  return obj;
-}
+/** \brief Allocate and initialize an object of the given type and physical index.
+ *
+ * If \p os_index is unknown or irrelevant, use \c HWLOC_UNKNOWN_INDEX.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_alloc_setup_object(hwloc_topology_t topology, hwloc_obj_type_t type, unsigned os_index);
 
 /** \brief Setup object cpusets/nodesets by OR'ing its children.
  *
@@ -337,6 +384,15 @@ hwloc_alloc_setup_object(hwloc_obj_type_t type, signed os_index)
  */
 HWLOC_DECLSPEC int hwloc_obj_add_children_sets(hwloc_obj_t obj);
 
+/** \brief Request a reconnection of children and levels in the topology.
+ *
+ * May be used by backends during discovery if they need arrays or lists
+ * of objects within levels or children to be fully connected.
+ *
+ * \p flags is currently unused, must be 0.
+ */
+HWLOC_DECLSPEC int hwloc_topology_reconnect(hwloc_topology_t topology, unsigned long flags __hwloc_attribute_unused);
+
 /** \brief Make sure that plugins can lookup core symbols.
  *
  * This is a sanity check to avoid lazy-lookup failures when libhwloc
@@ -375,7 +431,7 @@ hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, co
     static int verboseenv_value = 0;
     if (!verboseenv_checked) {
       const char *verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
-      verboseenv_value = atoi(verboseenv);
+      verboseenv_value = verboseenv ? atoi(verboseenv) : 0;
       verboseenv_checked = 1;
     }
     if (verboseenv_value)
@@ -392,38 +448,141 @@ hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, co
 
 
 
-/** \defgroup hwlocality_components_pci_funcs Components and Plugins: PCI functions to be used by components
+/** \defgroup hwlocality_components_filtering Components and Plugins: Filtering objects
  * @{
  */
 
-/** \brief Insert a list of PCI devices and bridges in the backend topology.
+/** \brief Tell whether a PCI class ID denotes a device that is considered important.
+ *
+ * \return 1 if \p classid is important, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_pcidev_subtype_important(unsigned classid)
+{
+  const unsigned base = classid >> 8; /* drop the low 8 bits, keep the base class */
+  if (classid == 0x0c04 /* PCI_CLASS_SERIAL_FIBER */ || classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */)
+    return 1;
+  return (base == 0x01 /* PCI_BASE_CLASS_STORAGE */
+          || base == 0x02 /* PCI_BASE_CLASS_NETWORK */
+          || base == 0x03 /* PCI_BASE_CLASS_DISPLAY */
+          || base == 0x0b /* PCI_BASE_CLASS_PROCESSOR */
+          || base == 0x12 /* Processing Accelerators */);
+}
+
+/** \brief Tell whether an OS device of the given subtype is considered important.
+ *
+ * \return 0 for ::HWLOC_OBJ_OSDEV_DMA, 1 for any other subtype.
+ */
+static __hwloc_inline int
+hwloc_filter_check_osdev_subtype_important(hwloc_obj_osdev_type_t subtype)
+{
+  return subtype == HWLOC_OBJ_OSDEV_DMA ? 0 : 1;
+}
+
+/** \brief Check whether a non-I/O object type should be filtered-out.
+ *
+ * Cannot be used for I/O objects.
  *
- * Insert a list of objects (either PCI device or bridges) starting at first_obj
- * (linked by next_sibling in the topology, and ending with NULL).
- * Objects are placed under the right bridges, and the remaining upstream bridges
- * are then inserted in the topology by calling the get_obj_cpuset() callback to
- * find their locality.
+ * \return 1 if the object type should be kept, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_keep_object_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  enum hwloc_type_filter_e type_filter = HWLOC_TYPE_FILTER_KEEP_NONE; /* stays KEEP_NONE unless the lookup overwrites it */
+  hwloc_topology_get_type_filter(topology, type, &type_filter);
+  assert(type_filter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT); /* IMPORTANT only used for I/O */
+  return (type_filter != HWLOC_TYPE_FILTER_KEEP_NONE);
+}
+
+/** \brief Decide whether the given object passes the topology type filter of its type.
+ *
+ * \return 1 if the object should be kept, 0 otherwise.
+ */
+static __hwloc_inline int
+hwloc_filter_check_keep_object(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  enum hwloc_type_filter_e type_filter = HWLOC_TYPE_FILTER_KEEP_NONE;
+  hwloc_topology_get_type_filter(topology, obj->type, &type_filter);
+  if (type_filter == HWLOC_TYPE_FILTER_KEEP_NONE)
+    return 0;
+  /* subtypes are only examined for PCI and OS devices under KEEP_IMPORTANT */
+  if (type_filter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT)
+    return 1;
+  if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+    return hwloc_filter_check_pcidev_subtype_important(obj->attr->pcidev.class_id);
+  if (obj->type == HWLOC_OBJ_OS_DEVICE)
+    return hwloc_filter_check_osdev_subtype_important(obj->attr->osdev.type);
+  return 1;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pcidisc Components and Plugins: helpers for PCI discovery
+ * @{
  */
-HWLOC_DECLSPEC int hwloc_insert_pci_device_list(struct hwloc_backend *backend, struct hwloc_obj *first_obj);
 
 /** \brief Return the offset of the given capability in the PCI config space buffer
  *
  * This function requires a 256-bytes config space. Unknown/unavailable bytes should be set to 0xff.
  */
-HWLOC_DECLSPEC unsigned hwloc_pci_find_cap(const unsigned char *config, unsigned cap);
+HWLOC_DECLSPEC unsigned hwloc_pcidisc_find_cap(const unsigned char *config, unsigned cap);
 
 /** \brief Fill linkspeed by reading the PCI config space where PCI_CAP_ID_EXP is at position offset.
  *
  * Needs 20 bytes of EXP capability block starting at offset in the config space
  * for registers up to link status.
  */
-HWLOC_DECLSPEC int hwloc_pci_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+HWLOC_DECLSPEC int hwloc_pcidisc_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+
+/** \brief Return the hwloc object type (PCI device or Bridge) for the given class and configuration space.
+ *
+ * This function requires 16 bytes of common configuration header at the beginning of config.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_pcidisc_check_bridge_type(unsigned device_class, const unsigned char *config);
+
+/** \brief Fills the attributes of the given PCI bridge using the given PCI config space.
+ *
+ * This function requires 32 bytes of common configuration header at the beginning of config.
+ *
+ * Returns -1 and destroys \p obj if bridge fields are invalid.
+ */
+HWLOC_DECLSPEC int hwloc_pcidisc_find_bridge_buses(unsigned domain, unsigned bus, unsigned dev, unsigned func,
+						   unsigned *secondary_busp, unsigned *subordinate_busp,
+						   const unsigned char *config);
 
-/** \brief Modify the PCI device object into a bridge and fill its attribute if a bridge is found in the PCI config space.
+/** \brief Insert a PCI object in the given PCI tree by looking at PCI bus IDs.
+ *
+ * If \p treep points to \c NULL, the new object is inserted there.
+ */
+HWLOC_DECLSPEC void hwloc_pcidisc_tree_insert_by_busid(struct hwloc_obj **treep, struct hwloc_obj *obj);
+
+/** \brief Add some hostbridges on top of the given tree of PCI objects and attach them to the topology.
+ *
+ * Other backends may lookup PCI objects or localities (for instance to attach OS devices)
+ * by using hwloc_pcidisc_find_by_busid() or hwloc_pcidisc_find_busid_parent().
+ */
+HWLOC_DECLSPEC int hwloc_pcidisc_tree_attach(struct hwloc_topology *topology, struct hwloc_obj *tree);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pcifind Components and Plugins: finding PCI objects during other discoveries
+ * @{
+ */
+
+/** \brief Find the normal parent of a PCI bus ID.
+ *
+ * Look at PCI affinity to find out where the given PCI bus ID should be attached.
  *
- * This function requires 64 bytes of common configuration header at the beginning of config.
+ * This function should be used to attach an I/O device under the corresponding
+ * PCI object (if any), or under a normal (non-I/O) object with same locality.
  */
-HWLOC_DECLSPEC int hwloc_pci_prepare_bridge(hwloc_obj_t obj, const unsigned char *config);
+HWLOC_DECLSPEC struct hwloc_obj * hwloc_pci_find_parent_by_busid(struct hwloc_topology *topology, unsigned domain, unsigned bus, unsigned dev, unsigned func);
 
 /** @} */
 
diff --git a/ext/hwloc/include/hwloc/rename.h b/ext/hwloc/include/hwloc/rename.h
index 2684e7123..a23738d0d 100644
--- a/ext/hwloc/include/hwloc/rename.h
+++ b/ext/hwloc/include/hwloc/rename.h
@@ -1,13 +1,13 @@
 /*
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
- * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2019 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
 #ifndef HWLOC_RENAME_H
 #define HWLOC_RENAME_H
 
-#include <hwloc/autogen/config.h>
+#include "hwloc/autogen/config.h"
 
 
 #ifdef __cplusplus
@@ -47,13 +47,21 @@ extern "C" {
 #define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
 #define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
 
-#define HWLOC_OBJ_SYSTEM HWLOC_NAME_CAPS(OBJ_SYSTEM)
 #define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
 #define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_MEMCACHE HWLOC_NAME_CAPS(OBJ_MEMCACHE)
 #define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
-#define HWLOC_OBJ_CACHE HWLOC_NAME_CAPS(OBJ_CACHE)
+#define HWLOC_OBJ_DIE HWLOC_NAME_CAPS(OBJ_DIE)
 #define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
 #define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_L1CACHE HWLOC_NAME_CAPS(OBJ_L1CACHE)
+#define HWLOC_OBJ_L2CACHE HWLOC_NAME_CAPS(OBJ_L2CACHE)
+#define HWLOC_OBJ_L3CACHE HWLOC_NAME_CAPS(OBJ_L3CACHE)
+#define HWLOC_OBJ_L4CACHE HWLOC_NAME_CAPS(OBJ_L4CACHE)
+#define HWLOC_OBJ_L5CACHE HWLOC_NAME_CAPS(OBJ_L5CACHE)
+#define HWLOC_OBJ_L1ICACHE HWLOC_NAME_CAPS(OBJ_L1ICACHE)
+#define HWLOC_OBJ_L2ICACHE HWLOC_NAME_CAPS(OBJ_L2ICACHE)
+#define HWLOC_OBJ_L3ICACHE HWLOC_NAME_CAPS(OBJ_L3ICACHE)
 #define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
 #define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
 #define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
@@ -87,16 +95,14 @@ extern "C" {
 #define hwloc_compare_types_e HWLOC_NAME(compare_types_e)
 #define HWLOC_TYPE_UNORDERED HWLOC_NAME_CAPS(TYPE_UNORDERED)
 
-#define hwloc_obj_memory_s HWLOC_NAME(obj_memory_s)
-#define hwloc_obj_memory_page_type_s HWLOC_NAME(obj_memory_page_type_s)
-
 #define hwloc_obj HWLOC_NAME(obj)
 #define hwloc_obj_t HWLOC_NAME(obj_t)
 
-#define hwloc_distances_s HWLOC_NAME(distances_s)
-#define hwloc_obj_info_s HWLOC_NAME(obj_info_s)
+#define hwloc_info_s HWLOC_NAME(info_s)
 
 #define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_numanode_attr_s HWLOC_NAME(numanode_attr_s)
+#define hwloc_memory_page_type_s HWLOC_NAME(memory_page_type_s)
 #define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
 #define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
 #define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
@@ -107,21 +113,22 @@ extern "C" {
 #define hwloc_topology_load HWLOC_NAME(topology_load)
 #define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
 #define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_abi_check HWLOC_NAME(topology_abi_check)
 #define hwloc_topology_check HWLOC_NAME(topology_check)
 
 #define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
 
-#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_SYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WITH_DISALLOWED)
 #define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
-#define HWLOC_TOPOLOGY_FLAG_IO_DEVICES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_DEVICES)
-#define HWLOC_TOPOLOGY_FLAG_IO_BRIDGES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_BRIDGES)
-#define HWLOC_TOPOLOGY_FLAG_WHOLE_IO HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_IO)
-#define HWLOC_TOPOLOGY_FLAG_ICACHES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_ICACHES)
+#define HWLOC_TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_THISSYSTEM_ALLOWED_RESOURCES)
 
 #define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
 #define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
 #define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
 #define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+#define hwloc_topology_components_flag_e HWLOC_NAME(hwloc_topology_components_flag_e)
+#define HWLOC_TOPOLOGY_COMPONENTS_FLAG_BLACKLIST HWLOC_NAME_CAPS(TOPOLOGY_COMPONENTS_FLAG_BLACKLIST)
+#define hwloc_topology_set_components HWLOC_NAME(topology_set_components)
 
 #define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
 #define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
@@ -131,19 +138,36 @@ extern "C" {
 #define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
 #define hwloc_topology_support HWLOC_NAME(topology_support)
 #define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
-#define hwloc_topology_ignore_type HWLOC_NAME(topology_ignore_type)
-#define hwloc_topology_ignore_type_keep_structure HWLOC_NAME(topology_ignore_type_keep_structure)
-#define hwloc_topology_ignore_all_keep_structure HWLOC_NAME(topology_ignore_all_keep_structure)
-#define hwloc_topology_set_distance_matrix HWLOC_NAME(topology_set_distance_matrix)
+
+#define hwloc_type_filter_e HWLOC_NAME(type_filter_e)
+#define HWLOC_TYPE_FILTER_KEEP_ALL HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_ALL)
+#define HWLOC_TYPE_FILTER_KEEP_NONE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_NONE)
+#define HWLOC_TYPE_FILTER_KEEP_STRUCTURE HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_STRUCTURE)
+#define HWLOC_TYPE_FILTER_KEEP_IMPORTANT HWLOC_NAME_CAPS(TYPE_FILTER_KEEP_IMPORTANT)
+#define hwloc_topology_set_type_filter HWLOC_NAME(topology_set_type_filter)
+#define hwloc_topology_get_type_filter HWLOC_NAME(topology_get_type_filter)
+#define hwloc_topology_set_all_types_filter HWLOC_NAME(topology_set_all_types_filter)
+#define hwloc_topology_set_cache_types_filter HWLOC_NAME(topology_set_cache_types_filter)
+#define hwloc_topology_set_icache_types_filter HWLOC_NAME(topology_set_icache_types_filter)
+#define hwloc_topology_set_io_types_filter HWLOC_NAME(topology_set_io_types_filter)
+
 #define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
 #define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
 
 #define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
-#define HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_DISTANCES)
+#define HWLOC_RESTRICT_FLAG_REMOVE_CPULESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_CPULESS)
+#define HWLOC_RESTRICT_FLAG_BYNODESET HWLOC_NAME_CAPS(RESTRICT_FLAG_BYNODESET)
+#define HWLOC_RESTRICT_FLAG_REMOVE_MEMLESS HWLOC_NAME_CAPS(RESTRICT_FLAG_REMOVE_MEMLESS)
 #define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
 #define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
 #define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
 
+#define hwloc_allow_flags_e HWLOC_NAME(allow_flags_e)
+#define HWLOC_ALLOW_FLAG_ALL HWLOC_NAME_CAPS(ALLOW_FLAG_ALL)
+#define HWLOC_ALLOW_FLAG_LOCAL_RESTRICTIONS HWLOC_NAME_CAPS(ALLOW_FLAG_LOCAL_RESTRICTIONS)
+#define HWLOC_ALLOW_FLAG_CUSTOM HWLOC_NAME_CAPS(ALLOW_FLAG_CUSTOM)
+#define hwloc_topology_allow HWLOC_NAME(topology_allow)
+
 #define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
 #define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
 #define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
@@ -151,6 +175,7 @@ extern "C" {
 
 #define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
 #define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+#define hwloc_get_memory_parents_depth HWLOC_NAME(get_memory_parents_depth)
 
 #define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
 #define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
@@ -159,6 +184,8 @@ extern "C" {
 #define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
 #define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
 #define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+#define HWLOC_TYPE_DEPTH_NUMANODE HWLOC_NAME_CAPS(TYPE_DEPTH_NUMANODE)
+#define HWLOC_TYPE_DEPTH_MEMCACHE HWLOC_NAME_CAPS(TYPE_DEPTH_MEMCACHE)
 
 #define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
 #define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
@@ -168,10 +195,11 @@ extern "C" {
 #define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type )
 
 #define hwloc_obj_type_string HWLOC_NAME(obj_type_string )
-#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
 #define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf )
 #define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf )
-#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_type_sscanf HWLOC_NAME(type_sscanf)
+#define hwloc_type_sscanf_as_depth HWLOC_NAME(type_sscanf_as_depth)
+
 #define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
 #define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
 
@@ -196,7 +224,6 @@ extern "C" {
 #define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
 #define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
 #define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
-#define HWLOC_MEMBIND_REPLICATE HWLOC_NAME_CAPS(MEMBIND_REPLICATE)
 #define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
 #define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
 
@@ -207,22 +234,17 @@ extern "C" {
 #define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
 #define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
 #define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+#define HWLOC_MEMBIND_BYNODESET HWLOC_NAME_CAPS(MEMBIND_BYNODESET)
 
 #define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
 
-#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
 #define hwloc_set_membind HWLOC_NAME(set_membind)
-#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
 #define hwloc_get_membind HWLOC_NAME(get_membind)
-#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
 #define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
-#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
 #define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
-#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
 #define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
-#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
 #define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
-#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+#define hwloc_get_area_memlocation HWLOC_NAME(get_area_memlocation)
 #define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
 #define hwloc_alloc HWLOC_NAME(alloc)
 #define hwloc_free HWLOC_NAME(free)
@@ -234,7 +256,6 @@ extern "C" {
 #define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
 #define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
 #define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
-#define hwloc_get_hostbridge_by_pcibus HWLOC_NAME(get_hostbridge_by_pcibus)
 
 /* hwloc/bitmap.h */
 
@@ -259,10 +280,12 @@ extern "C" {
 #define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
 #define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
 #define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
-
+#define hwloc_bitmap_from_ulongs HWLOC_NAME(bitmap_from_ulongs)
 #define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
 #define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
 #define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_to_ulongs HWLOC_NAME(bitmap_to_ulongs)
+#define hwloc_bitmap_nr_ulongs HWLOC_NAME(bitmap_nr_ulongs)
 #define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
 #define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
 #define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
@@ -284,6 +307,9 @@ extern "C" {
 #define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
 #define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
 #define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_first_unset HWLOC_NAME(bitmap_first_unset)
+#define hwloc_bitmap_last_unset HWLOC_NAME(bitmap_last_unset)
+#define hwloc_bitmap_next_unset HWLOC_NAME(bitmap_next_unset)
 #define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
 #define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
 #define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
@@ -316,6 +342,12 @@ extern "C" {
 #define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
 #define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
 #define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_obj_type_is_normal HWLOC_NAME(obj_type_is_normal)
+#define hwloc_obj_type_is_memory HWLOC_NAME(obj_type_is_memory)
+#define hwloc_obj_type_is_io HWLOC_NAME(obj_type_is_io)
+#define hwloc_obj_type_is_cache HWLOC_NAME(obj_type_is_cache)
+#define hwloc_obj_type_is_dcache HWLOC_NAME(obj_type_is_dcache)
+#define hwloc_obj_type_is_icache HWLOC_NAME(obj_type_is_icache)
 #define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
 #define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
 #define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
@@ -334,16 +366,12 @@ extern "C" {
 #define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
 #define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
 #define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
-#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
 #define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
-#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
-#define hwloc_get_whole_distance_matrix_by_depth HWLOC_NAME(get_whole_distance_matrix_by_depth)
-#define hwloc_get_whole_distance_matrix_by_type HWLOC_NAME(get_whole_distance_matrix_by_type)
-#define hwloc_get_distance_matrix_covering_obj_by_depth HWLOC_NAME(get_distance_matrix_covering_obj_by_depth)
-#define hwloc_get_latency HWLOC_NAME(get_latency)
 
 /* export.h */
 
+#define hwloc_topology_export_xml_flags_e HWLOC_NAME(topology_export_xml_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_XML_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_XML_FLAG_V1)
 #define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
 #define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
 #define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
@@ -355,8 +383,40 @@ extern "C" {
 #define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
 #define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
 #define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1 HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_V1)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_IGNORE_MEMORY)
 #define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
 
+/* distances.h */
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+
+#define hwloc_distances_kind_e HWLOC_NAME(distances_kind_e)
+#define HWLOC_DISTANCES_KIND_FROM_OS HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_OS)
+#define HWLOC_DISTANCES_KIND_FROM_USER HWLOC_NAME_CAPS(DISTANCES_KIND_FROM_USER)
+#define HWLOC_DISTANCES_KIND_MEANS_LATENCY HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_LATENCY)
+#define HWLOC_DISTANCES_KIND_MEANS_BANDWIDTH HWLOC_NAME_CAPS(DISTANCES_KIND_MEANS_BANDWIDTH)
+#define HWLOC_DISTANCES_KIND_HETEROGENEOUS_TYPES HWLOC_NAME_CAPS(DISTANCES_KIND_HETEROGENEOUS_TYPES)
+
+#define hwloc_distances_get HWLOC_NAME(distances_get)
+#define hwloc_distances_get_by_depth HWLOC_NAME(distances_get_by_depth)
+#define hwloc_distances_get_by_type HWLOC_NAME(distances_get_by_type)
+#define hwloc_distances_get_by_name HWLOC_NAME(distances_get_by_name)
+#define hwloc_distances_get_name HWLOC_NAME(distances_get_name)
+#define hwloc_distances_release HWLOC_NAME(distances_release)
+#define hwloc_distances_obj_index HWLOC_NAME(distances_obj_index)
+#define hwloc_distances_obj_pair_values HWLOC_NAME(distances_pair_values)
+
+#define hwloc_distances_add_flag_e HWLOC_NAME(distances_add_flag_e)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP)
+#define HWLOC_DISTANCES_ADD_FLAG_GROUP_INACCURATE HWLOC_NAME_CAPS(DISTANCES_ADD_FLAG_GROUP_INACCURATE)
+
+#define hwloc_distances_add HWLOC_NAME(distances_add)
+#define hwloc_distances_remove HWLOC_NAME(distances_remove)
+#define hwloc_distances_remove_by_depth HWLOC_NAME(distances_remove_by_depth)
+#define hwloc_distances_remove_by_type HWLOC_NAME(distances_remove_by_type)
+#define hwloc_distances_release_remove HWLOC_NAME(distances_release_remove)
+
 /* diff.h */
 
 #define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
@@ -387,6 +447,12 @@ extern "C" {
 #define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
 #define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
 
+/* shmem.h */
+
+#define hwloc_shmem_topology_get_length HWLOC_NAME(shmem_topology_get_length)
+#define hwloc_shmem_topology_write HWLOC_NAME(shmem_topology_write)
+#define hwloc_shmem_topology_adopt HWLOC_NAME(shmem_topology_adopt)
+
 /* glibc-sched.h */
 
 #define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
@@ -405,10 +471,10 @@ extern "C" {
 
 /* linux.h */
 
-#define hwloc_linux_parse_cpumap_file HWLOC_NAME(linux_parse_cpumap_file)
 #define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
 #define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
 #define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+#define hwloc_linux_read_path_as_cpumask HWLOC_NAME(linux_read_file_cpumask)
 
 /* openfabrics-verbs.h */
 
@@ -416,11 +482,6 @@ extern "C" {
 #define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
 #define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
 
-/* myriexpress.h */
-
-#define hwloc_mx_board_get_device_cpuset HWLOC_NAME(mx_board_get_device_cpuset)
-#define hwloc_mx_endpoint_get_device_cpuset HWLOC_NAME(mx_endpoint_get_device_cpuset)
-
 /* intel-mic.h */
 
 #define hwloc_intel_mic_get_device_cpuset HWLOC_NAME(intel_mic_get_device_cpuset)
@@ -428,6 +489,8 @@ extern "C" {
 
 /* opencl.h */
 
+#define hwloc_cl_device_topology_amd HWLOC_NAME(cl_device_topology_amd)
+#define hwloc_opencl_get_device_pci_busid HWLOC_NAME(opencl_get_device_pci_ids)
 #define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
 #define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
 #define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
@@ -461,21 +524,26 @@ extern "C" {
 
 /* hwloc/plugins.h */
 
-#define hwloc_disc_component_type_e HWLOC_NAME(disc_component_type_e)
-#define HWLOC_DISC_COMPONENT_TYPE_CPU HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_CPU)
-#define HWLOC_DISC_COMPONENT_TYPE_GLOBAL HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_GLOBAL)
-#define HWLOC_DISC_COMPONENT_TYPE_MISC HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_MISC)
-#define hwloc_disc_component_type_t HWLOC_NAME(disc_component_type_t)
+#define hwloc_disc_phase_e HWLOC_NAME(disc_phase_e)
+#define HWLOC_DISC_PHASE_GLOBAL HWLOC_NAME_CAPS(DISC_PHASE_GLOBAL)
+#define HWLOC_DISC_PHASE_CPU HWLOC_NAME_CAPS(DISC_PHASE_CPU)
+#define HWLOC_DISC_PHASE_MEMORY HWLOC_NAME_CAPS(DISC_PHASE_MEMORY)
+#define HWLOC_DISC_PHASE_PCI HWLOC_NAME_CAPS(DISC_PHASE_PCI)
+#define HWLOC_DISC_PHASE_IO HWLOC_NAME_CAPS(DISC_PHASE_IO)
+#define HWLOC_DISC_PHASE_MISC HWLOC_NAME_CAPS(DISC_PHASE_MISC)
+#define HWLOC_DISC_PHASE_ANNOTATE HWLOC_NAME_CAPS(DISC_PHASE_ANNOTATE)
+#define HWLOC_DISC_PHASE_TWEAK HWLOC_NAME_CAPS(DISC_PHASE_TWEAK)
+#define hwloc_disc_phase_t HWLOC_NAME(disc_phase_t)
 #define hwloc_disc_component HWLOC_NAME(disc_component)
 
+#define hwloc_disc_status_flag_e HWLOC_NAME(disc_status_flag_e)
+#define HWLOC_DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES HWLOC_NAME_CAPS(DISC_STATUS_FLAG_GOT_ALLOWED_RESOURCES)
+#define hwloc_disc_status HWLOC_NAME(disc_status)
+
 #define hwloc_backend HWLOC_NAME(backend)
-#define hwloc_backend_flag_e HWLOC_NAME(backend_flag_e)
-#define HWLOC_BACKEND_FLAG_NEED_LEVELS HWLOC_NAME_CAPS(BACKEND_FLAG_NEED_LEVELS)
 
 #define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
 #define hwloc_backend_enable HWLOC_NAME(backend_enable)
-#define hwloc_backends_get_obj_cpuset HWLOC_NAME(backends_get_obj_cpuset)
-#define hwloc_backends_notify_new_object HWLOC_NAME(backends_notify_new_object)
 
 #define hwloc_component_type_e HWLOC_NAME(component_type_e)
 #define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
@@ -493,28 +561,49 @@ extern "C" {
 #define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
 #define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
 #define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+#define hwloc_topology_reconnect HWLOC_NAME(topology_reconnect)
+
+#define hwloc_filter_check_pcidev_subtype_important HWLOC_NAME(filter_check_pcidev_subtype_important)
+#define hwloc_filter_check_osdev_subtype_important HWLOC_NAME(filter_check_osdev_subtype_important)
+#define hwloc_filter_check_keep_object_type HWLOC_NAME(filter_check_keep_object_type)
+#define hwloc_filter_check_keep_object HWLOC_NAME(filter_check_keep_object)
 
-#define hwloc_insert_pci_device_list HWLOC_NAME(insert_pci_device_list)
-#define hwloc_pci_find_cap HWLOC_NAME(pci_find_cap)
-#define hwloc_pci_find_linkspeed HWLOC_NAME(pci_find_linkspeed)
-#define hwloc_pci_prepare_bridge HWLOC_NAME(pci_prepare_bridge)
+#define hwloc_pcidisc_find_cap HWLOC_NAME(pcidisc_find_cap)
+#define hwloc_pcidisc_find_linkspeed HWLOC_NAME(pcidisc_find_linkspeed)
+#define hwloc_pcidisc_check_bridge_type HWLOC_NAME(pcidisc_check_bridge_type)
+#define hwloc_pcidisc_find_bridge_buses HWLOC_NAME(pcidisc_find_bridge_buses)
+#define hwloc_pcidisc_tree_insert_by_busid HWLOC_NAME(pcidisc_tree_insert_by_busid)
+#define hwloc_pcidisc_tree_attach HWLOC_NAME(pcidisc_tree_attach)
+
+#define hwloc_pci_find_parent_by_busid HWLOC_NAME(pcidisc_find_busid_parent)
 
 /* hwloc/deprecated.h */
 
-#define hwloc_obj_type_of_string HWLOC_NAME(obj_type_of_string )
-#define hwloc_obj_snprintf HWLOC_NAME(obj_snprintf)
-#define hwloc_distributev HWLOC_NAME(distributev)
-#define hwloc_distribute HWLOC_NAME(distribute)
 #define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
 
 /* private/debug.h */
 
+#define hwloc_debug_enabled HWLOC_NAME(debug_enabled)
 #define hwloc_debug HWLOC_NAME(debug)
 
 /* private/misc.h */
 
+#ifndef HWLOC_HAVE_CORRECT_SNPRINTF
 #define hwloc_snprintf HWLOC_NAME(snprintf)
-#define hwloc_namecoloncmp HWLOC_NAME(namecoloncmp)
+#endif
 #define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
 #define hwloc_ffs32 HWLOC_NAME(ffs32)
 #define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
@@ -524,6 +613,21 @@ extern "C" {
 #define hwloc_weight_long HWLOC_NAME(weight_long)
 #define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
 
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+#define hwloc_pci_class_string HWLOC_NAME(pci_class_string)
+#define hwloc_linux_pci_link_speed_from_string HWLOC_NAME(linux_pci_link_speed_from_string)
+
+#define hwloc_cache_type_by_depth_type HWLOC_NAME(cache_type_by_depth_type)
+#define hwloc__obj_type_is_normal HWLOC_NAME(_obj_type_is_normal)
+#define hwloc__obj_type_is_memory HWLOC_NAME(_obj_type_is_memory)
+#define hwloc__obj_type_is_io HWLOC_NAME(_obj_type_is_io)
+#define hwloc__obj_type_is_special HWLOC_NAME(_obj_type_is_special)
+
+#define hwloc__obj_type_is_cache HWLOC_NAME(_obj_type_is_cache)
+#define hwloc__obj_type_is_dcache HWLOC_NAME(_obj_type_is_dcache)
+#define hwloc__obj_type_is_icache HWLOC_NAME(_obj_type_is_icache)
+
 /* private/cpuid-x86.h */
 
 #define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
@@ -539,7 +643,8 @@ extern "C" {
 #define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
 #define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
 #define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
-#define hwloc__xml_export_object HWLOC_NAME(_xml_export_object)
+#define hwloc__xml_export_data_s HWLOC_NAME(_xml_export_data_s)
+#define hwloc__xml_export_topology HWLOC_NAME(_xml_export_topology)
 #define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
 
 #define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
@@ -547,36 +652,63 @@ extern "C" {
 #define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
 #define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
 
+#define hwloc__xml_imported_v1distances_s HWLOC_NAME(_xml_imported_v1distances_s)
+
 /* private/components.h */
 
 #define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
 #define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
 
-#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
 #define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+#define hwloc_backends_find_callbacks HWLOC_NAME(backends_find_callbacks)
+
+#define hwloc_topology_components_init HWLOC_NAME(topology_components_init)
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+#define hwloc_topology_components_fini HWLOC_NAME(topology_components_fini)
 
 #define hwloc_components_init HWLOC_NAME(components_init)
-#define hwloc_components_destroy_all HWLOC_NAME(components_destroy_all)
+#define hwloc_components_fini HWLOC_NAME(components_fini)
 
-/* private/private.h */
+/* private/internal-private.h */
+
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_synthetic_component HWLOC_NAME(synthetic_component)
+
+#define hwloc_aix_component HWLOC_NAME(aix_component)
+#define hwloc_bgq_component HWLOC_NAME(bgq_component)
+#define hwloc_darwin_component HWLOC_NAME(darwin_component)
+#define hwloc_freebsd_component HWLOC_NAME(freebsd_component)
+#define hwloc_hpux_component HWLOC_NAME(hpux_component)
+#define hwloc_linux_component HWLOC_NAME(linux_component)
+#define hwloc_netbsd_component HWLOC_NAME(netbsd_component)
+#define hwloc_noos_component HWLOC_NAME(noos_component)
+#define hwloc_solaris_component HWLOC_NAME(solaris_component)
+#define hwloc_windows_component HWLOC_NAME(windows_component)
+#define hwloc_x86_component HWLOC_NAME(x86_component)
+
+#define hwloc_cuda_component HWLOC_NAME(cuda_component)
+#define hwloc_gl_component HWLOC_NAME(gl_component)
+#define hwloc_nvml_component HWLOC_NAME(nvml_component)
+#define hwloc_opencl_component HWLOC_NAME(opencl_component)
+#define hwloc_pci_component HWLOC_NAME(pci_component)
+
+#define hwloc_xml_libxml_component HWLOC_NAME(xml_libxml_component)
+#define hwloc_xml_nolibxml_component HWLOC_NAME(xml_nolibxml_component)
 
-#define hwloc_ignore_type_e HWLOC_NAME(ignore_type_e)
+/* private/private.h */
 
-#define HWLOC_IGNORE_TYPE_NEVER HWLOC_NAME_CAPS(IGNORE_TYPE_NEVER)
-#define HWLOC_IGNORE_TYPE_KEEP_STRUCTURE HWLOC_NAME_CAPS(IGNORE_TYPE_KEEP_STRUCTURE)
-#define HWLOC_IGNORE_TYPE_ALWAYS HWLOC_NAME_CAPS(IGNORE_TYPE_ALWAYS)
+#define hwloc_special_level_s HWLOC_NAME(special_level_s)
 
-#define hwloc_os_distances_s HWLOC_NAME(os_distances_s)
+#define hwloc_pci_forced_locality_s HWLOC_NAME(pci_forced_locality_s)
+#define hwloc_pci_locality_s HWLOC_NAME(pci_locality_s)
 
-#define hwloc_xml_imported_distances_s HWLOC_NAME(xml_imported_distances_s)
+#define hwloc_topology_forced_component_s HWLOC_NAME(topology_forced_component)
 
-#define hwloc_alloc_obj_cpusets HWLOC_NAME(alloc_obj_cpusets)
+#define hwloc_alloc_root_sets HWLOC_NAME(alloc_root_sets)
 #define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
 #define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
 #define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
 #define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
-#define hwloc_connect_children HWLOC_NAME(connect_children)
-#define hwloc_connect_levels HWLOC_NAME(connect_levels)
 
 #define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
 #define hwloc__reorder_children HWLOC_NAME(_reorder_children)
@@ -584,8 +716,16 @@ extern "C" {
 #define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
 #define hwloc_topology_clear HWLOC_NAME(topology_clear)
 
+#define hwloc__attach_memory_object HWLOC_NAME(insert_memory_object)
+
+#define hwloc_pci_discovery_init HWLOC_NAME(pci_discovery_init)
+#define hwloc_pci_discovery_prepare HWLOC_NAME(pci_discovery_prepare)
+#define hwloc_pci_discovery_exit HWLOC_NAME(pci_discovery_exit)
+#define hwloc_pci_find_by_busid HWLOC_NAME(pcidisc_find_by_busid)
+#define hwloc_find_insert_io_parent_by_complete_cpuset HWLOC_NAME(hwloc_find_insert_io_parent_by_complete_cpuset)
+
 #define hwloc__add_info HWLOC_NAME(_add_info)
-#define hwloc__find_info_slot HWLOC_NAME(_find_info_slot)
+#define hwloc__add_info_nodup HWLOC_NAME(_add_info_nodup)
 #define hwloc__move_infos HWLOC_NAME(_move_infos)
 #define hwloc__free_infos HWLOC_NAME(_free_infos)
 
@@ -597,16 +737,20 @@ extern "C" {
 #define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
 #define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
 #define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
-#define hwloc_set_osf_hooks HWLOC_NAME(set_osf_hooks)
 #define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
 #define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
 #define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
 #define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
 #define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
 
+#define hwloc_look_hardwired_fujitsu_k HWLOC_NAME(look_hardwired_fujitsu_k)
+#define hwloc_look_hardwired_fujitsu_fx10 HWLOC_NAME(look_hardwired_fujitsu_fx10)
+#define hwloc_look_hardwired_fujitsu_fx100 HWLOC_NAME(look_hardwired_fujitsu_fx100)
+
 #define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
 #define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
-#define hwloc__duplicate_objects HWLOC_NAME(_duplicate_objects)
+#define hwloc_free_object_and_children HWLOC_NAME(free_object_and_children)
+#define hwloc_free_object_siblings_and_children HWLOC_NAME(free_object_siblings_and_children)
 
 #define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
 #define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
@@ -614,31 +758,35 @@ extern "C" {
 #define hwloc_free_mmap HWLOC_NAME(free_mmap)
 #define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
 
-#define hwloc_distances_init HWLOC_NAME(distances_init)
-#define hwloc_distances_destroy HWLOC_NAME(distances_destroy)
-#define hwloc_distances_set HWLOC_NAME(distances_set)
-#define hwloc_distances_set_from_env HWLOC_NAME(distances_set_from_env)
-#define hwloc_distances_restrict_os HWLOC_NAME(distances_restrict_os)
-#define hwloc_distances_restrict HWLOC_NAME(distances_restrict)
-#define hwloc_distances_finalize_os HWLOC_NAME(distances_finalize_os)
-#define hwloc_distances_finalize_logical HWLOC_NAME(distances_finalize_logical)
-#define hwloc_clear_object_distances HWLOC_NAME(clear_object_distances)
-#define hwloc_clear_object_distances_one HWLOC_NAME(clear_object_distances_one)
-#define hwloc_group_by_distances HWLOC_NAME(group_by_distances)
+#define hwloc_internal_distances_s HWLOC_NAME(internal_distances_s)
+#define hwloc_internal_distances_init HWLOC_NAME(internal_distances_init)
+#define hwloc_internal_distances_prepare HWLOC_NAME(internal_distances_prepare)
+#define hwloc_internal_distances_dup HWLOC_NAME(internal_distances_dup)
+#define hwloc_internal_distances_refresh HWLOC_NAME(internal_distances_refresh)
+#define hwloc_internal_distances_destroy HWLOC_NAME(internal_distances_destroy)
+
+#define hwloc_internal_distances_add HWLOC_NAME(internal_distances_add)
+#define hwloc_internal_distances_add_by_index HWLOC_NAME(internal_distances_add_by_index)
+#define hwloc_internal_distances_invalidate_cached_objs HWLOC_NAME(hwloc_internal_distances_invalidate_cached_objs)
 
 #define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
 #define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
 
-#define hwloc_obj_add_info_nodup HWLOC_NAME(obj_add_info_nodup)
-
 #define hwloc_progname HWLOC_NAME(progname)
 
-#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+#define hwloc__topology_disadopt HWLOC_NAME(_topology_disadopt)
+#define hwloc__topology_dup HWLOC_NAME(_topology_dup)
+
+#define hwloc_tma HWLOC_NAME(tma)
+#define hwloc_tma_malloc HWLOC_NAME(tma_malloc)
+#define hwloc_tma_calloc HWLOC_NAME(tma_calloc)
+#define hwloc_tma_strdup HWLOC_NAME(tma_strdup)
+#define hwloc_bitmap_tma_dup HWLOC_NAME(bitmap_tma_dup)
 
 /* private/solaris-chiptype.h */
 
-#define hwloc_solaris_get_chip_type HWLOC_NAME(solaris_get_chip_type)
-#define hwloc_solaris_get_chip_model HWLOC_NAME(solaris_get_chip_model)
+#define hwloc_solaris_chip_info_s HWLOC_NAME(solaris_chip_info_s)
+#define hwloc_solaris_get_chip_info HWLOC_NAME(solaris_get_chip_info)
 
 #endif /* HWLOC_SYM_TRANSFORM */
 
diff --git a/ext/hwloc/include/hwloc/shmem.h b/ext/hwloc/include/hwloc/shmem.h
new file mode 100644
index 000000000..86f57b4f6
--- /dev/null
+++ b/ext/hwloc/include/hwloc/shmem.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2013-2018 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Sharing topologies between processes
+ */
+
+#ifndef HWLOC_SHMEM_H
+#define HWLOC_SHMEM_H
+
+#include "hwloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_shmem Sharing topologies between processes
+ *
+ * These functions are used to share a topology between processes by
+ * duplicating it into a file-backed shared-memory buffer.
+ *
+ * The master process must first get the required shared-memory size
+ * for storing this topology with hwloc_shmem_topology_get_length().
+ *
+ * Then it must find a virtual memory area of that size that is available
+ * in all processes (identical virtual addresses in all processes).
+ * On Linux, this can be done by comparing holes found in /proc/\<pid\>/maps
+ * for each process.
+ *
+ * Once found, it must open a destination file for storing the buffer,
+ * and pass it to hwloc_shmem_topology_write() together with
+ * the virtual memory address and length obtained above.
+ *
+ * Other processes may then adopt this shared topology by opening the
+ * same file and passing it to hwloc_shmem_topology_adopt() with the
+ * exact same virtual memory address and length.
+ *
+ * @{
+ */
+
+/** \brief Get the required shared memory length for storing a topology.
+ *
+ * This length (in bytes) must be used in hwloc_shmem_topology_write()
+ * and hwloc_shmem_topology_adopt() later.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_get_length(hwloc_topology_t topology,
+						   size_t *lengthp,
+						   unsigned long flags);
+
+/** \brief Duplicate a topology to a shared memory file.
+ *
+ * Temporarily map a file in virtual memory and duplicate the
+ * topology \p topology by allocating duplicates in there.
+ *
+ * The segment of the file pointed to by descriptor \p fd,
+ * starting at offset \p fileoffset, and of length \p length (in bytes),
+ * will be temporarily mapped at virtual address \p mmap_address
+ * during the duplication.
+ *
+ * The mapping length \p length must have been previously obtained with
+ * hwloc_shmem_topology_get_length()
+ * and the topology must not have been modified in the meantime.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ *
+ * \note The object userdata pointer is duplicated but the pointed buffer
+ * is not. However the caller may also allocate it manually in shared memory
+ * to share it as well.
+ *
+ * \return -1 with errno set to EBUSY if the virtual memory mapping defined
+ * by \p mmap_address and \p length isn't available in the process.
+ * \return -1 with errno set to EINVAL if \p fileoffset, \p mmap_address
+ * or \p length aren't page-aligned.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_write(hwloc_topology_t topology,
+					      int fd, hwloc_uint64_t fileoffset,
+					      void *mmap_address, size_t length,
+					      unsigned long flags);
+
+/** \brief Adopt a shared memory topology stored in a file.
+ *
+ * Map a file in virtual memory and adopt the topology that was previously
+ * stored there with hwloc_shmem_topology_write().
+ *
+ * The adopted topology returned in \p topologyp can be used just like any
+ * topology, and it must be destroyed with hwloc_topology_destroy() as usual.
+ *
+ * However the topology is read-only.
+ * For instance, it cannot be modified with hwloc_topology_restrict()
+ * and object userdata pointers cannot be changed.
+ *
+ * The segment of the file pointed to by descriptor \p fd,
+ * starting at offset \p fileoffset, and of length \p length (in bytes),
+ * will be mapped at virtual address \p mmap_address.
+ *
+ * The file pointed to by descriptor \p fd, the offset \p fileoffset,
+ * the requested mapping virtual address \p mmap_address and the length \p length
+ * must be identical to what was given to hwloc_shmem_topology_write() earlier.
+ *
+ * \note Flags \p flags are currently unused, must be 0.
+ *
+ * \note The object userdata pointer should not be used unless the process
+ * that created the shared topology also placed userdata-pointed buffers
+ * in shared memory.
+ *
+ * \note This function takes care of calling hwloc_topology_abi_check().
+ *
+ * \return -1 with errno set to EBUSY if the virtual memory mapping defined
+ * by \p mmap_address and \p length isn't available in the process.
+ *
+ * \return -1 with errno set to EINVAL if \p fileoffset, \p mmap_address
+ * or \p length aren't page-aligned, or do not match what was given to
+ * hwloc_shmem_topology_write() earlier.
+ *
+ * \return -1 with errno set to EINVAL if the layout of the topology structure
+ * is different between the writer process and the adopter process.
+ */
+HWLOC_DECLSPEC int hwloc_shmem_topology_adopt(hwloc_topology_t *topologyp,
+					      int fd, hwloc_uint64_t fileoffset,
+					      void *mmap_address, size_t length,
+					      unsigned long flags);
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_SHMEM_H */
diff --git a/ext/hwloc/include/numa.h b/ext/hwloc/include/numa.h
deleted file mode 100644
index 1dbc13728..000000000
--- a/ext/hwloc/include/numa.h
+++ /dev/null
@@ -1,468 +0,0 @@
-/* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs.
-
-   libnuma is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; version
-   2.1.
-
-   libnuma is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should find a copy of v2.1 of the GNU Lesser General Public License
-   somewhere on your Linux system; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
-
-#ifndef _NUMA_H
-#define _NUMA_H 1
-
-/* allow an application to test for the current programming interface: */
-#define LIBNUMA_API_VERSION 2
-
-/* Simple NUMA policy library */
-
-#include <stddef.h>
-#include <string.h>
-#include <sys/types.h>
-#include <stdlib.h>
-
-#if defined(__x86_64__) || defined(__i386__)
-#define NUMA_NUM_NODES  128
-#else
-#define NUMA_NUM_NODES  2048
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-        unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)];
-} nodemask_t;
-
-struct bitmask {
-	unsigned long size; /* number of bits in the map */
-	unsigned long *maskp;
-};
-
-/* operations on struct bitmask */
-int numa_bitmask_isbitset(const struct bitmask *, unsigned int);
-struct bitmask *numa_bitmask_setall(struct bitmask *);
-struct bitmask *numa_bitmask_clearall(struct bitmask *);
-struct bitmask *numa_bitmask_setbit(struct bitmask *, unsigned int);
-struct bitmask *numa_bitmask_clearbit(struct bitmask *, unsigned int);
-unsigned int numa_bitmask_nbytes(struct bitmask *);
-struct bitmask *numa_bitmask_alloc(unsigned int);
-void numa_bitmask_free(struct bitmask *);
-int numa_bitmask_equal(const struct bitmask *, const struct bitmask *);
-void copy_nodemask_to_bitmask(nodemask_t *, struct bitmask *);
-void copy_bitmask_to_nodemask(struct bitmask *, nodemask_t *);
-void copy_bitmask_to_bitmask(struct bitmask *, struct bitmask *);
-
-/* compatibility for codes that used them: */
-
-static inline void nodemask_zero(nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_bitmask_clearall(&tmp);
-}
-
-static inline void nodemask_zero_compat(nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_bitmask_clearall(&tmp);
-}
-
-static inline void nodemask_set_compat(nodemask_t *mask, int node)
-{
-	mask->n[node / (8*sizeof(unsigned long))] |=
-		(1UL<<(node%(8*sizeof(unsigned long))));
-}
-
-static inline void nodemask_clr_compat(nodemask_t *mask, int node)
-{
-	mask->n[node / (8*sizeof(unsigned long))] &=
-		~(1UL<<(node%(8*sizeof(unsigned long))));
-}
-
-static inline int nodemask_isset_compat(const nodemask_t *mask, int node)
-{
-	if ((unsigned)node >= NUMA_NUM_NODES)
-		return 0;
-	if (mask->n[node / (8*sizeof(unsigned long))] &
-		(1UL<<(node%(8*sizeof(unsigned long)))))
-		return 1;
-	return 0;
-}
-
-static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b)
-{
-	struct bitmask tmp_a, tmp_b;
-
-	tmp_a.maskp = (unsigned long *)a;
-	tmp_a.size = sizeof(nodemask_t) * 8;
-
-	tmp_b.maskp = (unsigned long *)b;
-	tmp_b.size = sizeof(nodemask_t) * 8;
-
-	return numa_bitmask_equal(&tmp_a, &tmp_b);
-}
-
-static inline int nodemask_equal_compat(const nodemask_t *a, const nodemask_t *b)
-{
-	struct bitmask tmp_a, tmp_b;
-
-	tmp_a.maskp = (unsigned long *)a;
-	tmp_a.size = sizeof(nodemask_t) * 8;
-
-	tmp_b.maskp = (unsigned long *)b;
-	tmp_b.size = sizeof(nodemask_t) * 8;
-
-	return numa_bitmask_equal(&tmp_a, &tmp_b);
-}
-
-/* NUMA support available. If this returns a negative value all other function
-   in this library are undefined. */
-int numa_available(void);
-
-/* Basic NUMA state */
-
-/* Get max available node */
-int numa_max_node(void);
-int numa_max_possible_node(void);
-/* Return preferred node */
-int numa_preferred(void);
-
-/* Return node size and free memory */
-long long numa_node_size64(int node, long long *freep);
-long numa_node_size(int node, long *freep);
-
-int numa_pagesize(void);
-
-/* Set with all nodes from which the calling process may allocate memory.
-   Only valid after numa_available. */
-extern struct bitmask *numa_all_nodes_ptr;
-
-/* Set with all nodes the kernel has exposed to userspace */
-extern struct bitmask *numa_nodes_ptr;
-
-/* For source compatibility */
-extern nodemask_t numa_all_nodes;
-
-/* Set with all cpus. */
-extern struct bitmask *numa_all_cpus_ptr;
-
-/* Set with no nodes */
-extern struct bitmask *numa_no_nodes_ptr;
-
-/* Source compatibility */
-extern nodemask_t numa_no_nodes;
-
-/* Only run and allocate memory from a specific set of nodes. */
-void numa_bind(struct bitmask *nodes);
-
-/* Set the NUMA node interleaving mask. 0 to turn off interleaving */
-void numa_set_interleave_mask(struct bitmask *nodemask);
-
-/* Return the current interleaving mask */
-struct bitmask *numa_get_interleave_mask(void);
-
-/* allocate a bitmask big enough for all nodes */
-struct bitmask *numa_allocate_nodemask(void);
-
-static inline void numa_free_nodemask(struct bitmask *b)
-{
-	numa_bitmask_free(b);
-}
-
-/* Some node to preferably allocate memory from for task. */
-void numa_set_preferred(int node);
-
-/* Set local memory allocation policy for task */
-void numa_set_localalloc(void);
-
-/* Only allocate memory from the nodes set in mask. 0 to turn off */
-void numa_set_membind(struct bitmask *nodemask);
-
-/* Return current membind */
-struct bitmask *numa_get_membind(void);
-
-/* Return allowed memories [nodes] */
-struct bitmask *numa_get_mems_allowed(void);
-
-int numa_get_interleave_node(void);
-
-/* NUMA memory allocation. These functions always round to page size
-   and are relatively slow. */
-
-/* Alloc memory page interleaved on nodes in mask */
-void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask);
-/* Alloc memory page interleaved on all nodes. */
-void *numa_alloc_interleaved(size_t size);
-/* Alloc memory located on node */
-void *numa_alloc_onnode(size_t size, int node);
-/* Alloc memory on local node */
-void *numa_alloc_local(size_t size);
-/* Allocation with current policy */
-void *numa_alloc(size_t size);
-/* Change the size of a memory area preserving the memory policy */
-void *numa_realloc(void *old_addr, size_t old_size, size_t new_size);
-/* Free memory allocated by the functions above */
-void numa_free(void *mem, size_t size);
-
-/* Low level functions, primarily for shared memory. All memory
-   processed by these must not be touched yet */
-
-/* Interleave an memory area. */
-void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask);
-
-/* Allocate a memory area on a specific node. */
-void numa_tonode_memory(void *start, size_t size, int node);
-
-/* Allocate memory on a mask of nodes. */
-void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask);
-
-/* Allocate a memory area on the current node. */
-void numa_setlocal_memory(void *start, size_t size);
-
-/* Allocate memory area with current memory policy */
-void numa_police_memory(void *start, size_t size);
-
-/* Run current task only on nodes in mask */
-int numa_run_on_node_mask(struct bitmask *mask);
-/* Run current task only on node */
-int numa_run_on_node(int node);
-/* Return current mask of nodes the task can run on */
-struct bitmask * numa_get_run_node_mask(void);
-
-/* When strict fail allocation when memory cannot be allocated in target node(s). */
-void numa_set_bind_policy(int strict);
-
-/* Fail when existing memory has incompatible policy */
-void numa_set_strict(int flag);
-
-/* maximum nodes (size of kernel nodemask_t) */
-int numa_num_possible_nodes();
-
-/* maximum cpus (size of kernel cpumask_t) */
-int numa_num_possible_cpus();
-
-/* nodes in the system */
-int numa_num_configured_nodes();
-
-/* maximum cpus */
-int numa_num_configured_cpus();
-
-/* maximum cpus allowed to current task */
-int numa_num_task_cpus();
-int numa_num_thread_cpus(); /* backward compatibility */
-
-/* maximum nodes allowed to current task */
-int numa_num_task_nodes();
-int numa_num_thread_nodes(); /* backward compatibility */
-
-/* allocate a bitmask the size of the kernel cpumask_t */
-struct bitmask *numa_allocate_cpumask();
-
-static inline void numa_free_cpumask(struct bitmask *b)
-{
-	numa_bitmask_free(b);
-}
-
-/* Convert node to CPU mask. -1/errno on failure, otherwise 0. */
-int numa_node_to_cpus(int, struct bitmask *);
-
-/* report the node of the specified cpu. -1/errno on invalid cpu. */
-int numa_node_of_cpu(int cpu);
-
-/* Report distance of node1 from node2. 0 on error.*/
-int numa_distance(int node1, int node2);
-
-/* Error handling. */
-/* This is an internal function in libnuma that can be overwritten by an user
-   program. Default is to print an error to stderr and exit if numa_exit_on_error
-   is true. */
-void numa_error(char *where);
-
-/* When true exit the program when a NUMA system call (except numa_available)
-   fails */
-extern int numa_exit_on_error;
-/* Warning function. Can also be overwritten. Default is to print on stderr
-   once. */
-void numa_warn(int num, char *fmt, ...);
-
-/* When true exit the program on a numa_warn() call */
-extern int numa_exit_on_warn;
-
-int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to);
-
-int numa_move_pages(int pid, unsigned long count, void **pages,
-		const int *nodes, int *status, int flags);
-
-int numa_sched_getaffinity(pid_t, struct bitmask *);
-int numa_sched_setaffinity(pid_t, struct bitmask *);
-
-/* Convert an ascii list of nodes to a bitmask */
-struct bitmask *numa_parse_nodestring(char *);
-
-/* Convert an ascii list of cpu to a bitmask */
-struct bitmask *numa_parse_cpustring(char *);
-
-/*
- * The following functions are for source code compatibility
- * with releases prior to version 2.
- * Such codes should be compiled with NUMA_VERSION1_COMPATIBILITY defined.
- */
-
-static inline void numa_set_interleave_mask_compat(nodemask_t *nodemask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)nodemask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_set_interleave_mask(&tmp);
-}
-
-static inline nodemask_t numa_get_interleave_mask_compat()
-{
-	struct bitmask *tp;
-	nodemask_t mask;
-
-	tp = numa_get_interleave_mask();
-	copy_bitmask_to_nodemask(tp, &mask);
-	numa_bitmask_free(tp);
-	return mask;
-}
-
-static inline void numa_bind_compat(nodemask_t *mask)
-{
-	struct bitmask *tp;
-
-	tp = numa_allocate_nodemask();
-	copy_nodemask_to_bitmask(mask, tp);
-	numa_bind(tp);
-	numa_bitmask_free(tp);
-}
-
-static inline void numa_set_membind_compat(nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_set_membind(&tmp);
-}
-
-static inline nodemask_t numa_get_membind_compat()
-{
-	struct bitmask *tp;
-	nodemask_t mask;
-
-	tp = numa_get_membind();
-	copy_bitmask_to_nodemask(tp, &mask);
-	numa_bitmask_free(tp);
-	return mask;
-}
-
-static inline void *numa_alloc_interleaved_subset_compat(size_t size,
-					const nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	return numa_alloc_interleaved_subset(size, &tmp);
-}
-
-static inline int numa_run_on_node_mask_compat(const nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	return numa_run_on_node_mask(&tmp);
-}
-
-static inline nodemask_t numa_get_run_node_mask_compat()
-{
-	struct bitmask *tp;
-	nodemask_t mask;
-
-	tp = numa_get_run_node_mask();
-	copy_bitmask_to_nodemask(tp, &mask);
-	numa_bitmask_free(tp);
-	return mask;
-}
-
-static inline void numa_interleave_memory_compat(void *mem, size_t size,
-						const nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_interleave_memory(mem, size, &tmp);
-}
-
-static inline void numa_tonodemask_memory_compat(void *mem, size_t size,
-						const nodemask_t *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = sizeof(nodemask_t) * 8;
-	numa_tonodemask_memory(mem, size, &tmp);
-}
-
-static inline int numa_sched_getaffinity_compat(pid_t pid, unsigned len,
-						unsigned long *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = len * 8;
-	return numa_sched_getaffinity(pid, &tmp);
-}
-
-static inline int numa_sched_setaffinity_compat(pid_t pid, unsigned len,
-						unsigned long *mask)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)mask;
-	tmp.size = len * 8;
-	return numa_sched_setaffinity(pid, &tmp);
-}
-
-static inline int numa_node_to_cpus_compat(int node, unsigned long *buffer,
-							int buffer_len)
-{
-	struct bitmask tmp;
-
-	tmp.maskp = (unsigned long *)buffer;
-	tmp.size = buffer_len * 8;
-	return numa_node_to_cpus(node, &tmp);
-}
-
-/* end of version 1 compatibility functions */
-
-/*
- * To compile an application that uses libnuma version 1:
- *   add -DNUMA_VERSION1_COMPATIBILITY to your Makefile's CFLAGS
- */
-#ifdef NUMA_VERSION1_COMPATIBILITY
-#include <numacompat1.h>
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/ext/hwloc/include/pci/config.h b/ext/hwloc/include/pci/config.h
deleted file mode 100644
index beecb1d1b..000000000
--- a/ext/hwloc/include/pci/config.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#define PCI_CONFIG_H
-#define PCI_ARCH_X86_64
-#define PCI_OS_LINUX
-#define PCI_HAVE_PM_LINUX_SYSFS
-#define PCI_HAVE_PM_LINUX_PROC
-#define PCI_HAVE_LINUX_BYTEORDER_H
-#define PCI_PATH_PROC_BUS_PCI "/proc/bus/pci"
-#define PCI_PATH_SYS_BUS_PCI "/sys/bus/pci"
-#define PCI_HAVE_PM_INTEL_CONF
-#define PCI_HAVE_64BIT_ADDRESS
-#define PCI_HAVE_PM_DUMP
-#define PCI_COMPRESSED_IDS
-#define PCI_IDS "pci.ids.gz"
-#define PCI_PATH_IDS_DIR "/usr/share/misc"
-#define PCI_USE_DNS
-#define PCI_ID_DOMAIN "pci.id.ucw.cz"
diff --git a/ext/hwloc/include/pci/header.h b/ext/hwloc/include/pci/header.h
deleted file mode 100644
index d481f2769..000000000
--- a/ext/hwloc/include/pci/header.h
+++ /dev/null
@@ -1,1195 +0,0 @@
-/*
- *	The PCI Library -- PCI Header Structure (based on <linux/pci.h>)
- *
- *	Copyright (c) 1997--2010 Martin Mares <mj@ucw.cz>
- *
- *	Can be freely distributed and used under the terms of the GNU GPL.
- */
-
-/*
- * Under PCI, each device has 256 bytes of configuration address space,
- * of which the first 64 bytes are standardized as follows:
- */
-#define PCI_VENDOR_ID		0x00	/* 16 bits */
-#define PCI_DEVICE_ID		0x02	/* 16 bits */
-#define PCI_COMMAND		0x04	/* 16 bits */
-#define  PCI_COMMAND_IO		0x1	/* Enable response in I/O space */
-#define  PCI_COMMAND_MEMORY	0x2	/* Enable response in Memory space */
-#define  PCI_COMMAND_MASTER	0x4	/* Enable bus mastering */
-#define  PCI_COMMAND_SPECIAL	0x8	/* Enable response to special cycles */
-#define  PCI_COMMAND_INVALIDATE	0x10	/* Use memory write and invalidate */
-#define  PCI_COMMAND_VGA_PALETTE 0x20	/* Enable palette snooping */
-#define  PCI_COMMAND_PARITY	0x40	/* Enable parity checking */
-#define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
-#define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
-#define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
-#define  PCI_COMMAND_DISABLE_INTx	0x400	/* PCIE: Disable INTx interrupts */
-
-#define PCI_STATUS		0x06	/* 16 bits */
-#define  PCI_STATUS_INTx	0x08	/* PCIE: INTx interrupt pending */
-#define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
-#define  PCI_STATUS_66MHZ	0x20	/* Support 66 Mhz PCI 2.1 bus */
-#define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
-#define  PCI_STATUS_FAST_BACK	0x80	/* Accept fast-back to back */
-#define  PCI_STATUS_PARITY	0x100	/* Detected parity error */
-#define  PCI_STATUS_DEVSEL_MASK	0x600	/* DEVSEL timing */
-#define  PCI_STATUS_DEVSEL_FAST	0x000
-#define  PCI_STATUS_DEVSEL_MEDIUM 0x200
-#define  PCI_STATUS_DEVSEL_SLOW 0x400
-#define  PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */
-#define  PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */
-#define  PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */
-#define  PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */
-#define  PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */
-
-#define PCI_CLASS_REVISION	0x08	/* High 24 bits are class, low 8
-					   revision */
-#define PCI_REVISION_ID         0x08    /* Revision ID */
-#define PCI_CLASS_PROG          0x09    /* Reg. Level Programming Interface */
-#define PCI_CLASS_DEVICE        0x0a    /* Device class */
-
-#define PCI_CACHE_LINE_SIZE	0x0c	/* 8 bits */
-#define PCI_LATENCY_TIMER	0x0d	/* 8 bits */
-#define PCI_HEADER_TYPE		0x0e	/* 8 bits */
-#define  PCI_HEADER_TYPE_NORMAL	0
-#define  PCI_HEADER_TYPE_BRIDGE 1
-#define  PCI_HEADER_TYPE_CARDBUS 2
-
-#define PCI_BIST		0x0f	/* 8 bits */
-#define PCI_BIST_CODE_MASK	0x0f	/* Return result */
-#define PCI_BIST_START		0x40	/* 1 to start BIST, 2 secs or less */
-#define PCI_BIST_CAPABLE	0x80	/* 1 if BIST capable */
-
-/*
- * Base addresses specify locations in memory or I/O space.
- * Decoded size can be determined by writing a value of
- * 0xffffffff to the register, and reading it back.  Only
- * 1 bits are decoded.
- */
-#define PCI_BASE_ADDRESS_0	0x10	/* 32 bits */
-#define PCI_BASE_ADDRESS_1	0x14	/* 32 bits [htype 0,1 only] */
-#define PCI_BASE_ADDRESS_2	0x18	/* 32 bits [htype 0 only] */
-#define PCI_BASE_ADDRESS_3	0x1c	/* 32 bits */
-#define PCI_BASE_ADDRESS_4	0x20	/* 32 bits */
-#define PCI_BASE_ADDRESS_5	0x24	/* 32 bits */
-#define  PCI_BASE_ADDRESS_SPACE	0x01	/* 0 = memory, 1 = I/O */
-#define  PCI_BASE_ADDRESS_SPACE_IO 0x01
-#define  PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
-#define  PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06
-#define  PCI_BASE_ADDRESS_MEM_TYPE_32	0x00	/* 32 bit address */
-#define  PCI_BASE_ADDRESS_MEM_TYPE_1M	0x02	/* Below 1M [obsolete] */
-#define  PCI_BASE_ADDRESS_MEM_TYPE_64	0x04	/* 64 bit address */
-#define  PCI_BASE_ADDRESS_MEM_PREFETCH	0x08	/* prefetchable? */
-#define  PCI_BASE_ADDRESS_MEM_MASK	(~(pciaddr_t)0x0f)
-#define  PCI_BASE_ADDRESS_IO_MASK	(~(pciaddr_t)0x03)
-/* bit 1 is reserved if address_space = 1 */
-
-/* Header type 0 (normal devices) */
-#define PCI_CARDBUS_CIS		0x28
-#define PCI_SUBSYSTEM_VENDOR_ID	0x2c
-#define PCI_SUBSYSTEM_ID	0x2e
-#define PCI_ROM_ADDRESS		0x30	/* Bits 31..11 are address, 10..1 reserved */
-#define  PCI_ROM_ADDRESS_ENABLE	0x01
-#define PCI_ROM_ADDRESS_MASK	(~(pciaddr_t)0x7ff)
-
-#define PCI_CAPABILITY_LIST	0x34	/* Offset of first capability list entry */
-
-/* 0x35-0x3b are reserved */
-#define PCI_INTERRUPT_LINE	0x3c	/* 8 bits */
-#define PCI_INTERRUPT_PIN	0x3d	/* 8 bits */
-#define PCI_MIN_GNT		0x3e	/* 8 bits */
-#define PCI_MAX_LAT		0x3f	/* 8 bits */
-
-/* Header type 1 (PCI-to-PCI bridges) */
-#define PCI_PRIMARY_BUS		0x18	/* Primary bus number */
-#define PCI_SECONDARY_BUS	0x19	/* Secondary bus number */
-#define PCI_SUBORDINATE_BUS	0x1a	/* Highest bus number behind the bridge */
-#define PCI_SEC_LATENCY_TIMER	0x1b	/* Latency timer for secondary interface */
-#define PCI_IO_BASE		0x1c	/* I/O range behind the bridge */
-#define PCI_IO_LIMIT		0x1d
-#define  PCI_IO_RANGE_TYPE_MASK	0x0f	/* I/O bridging type */
-#define  PCI_IO_RANGE_TYPE_16	0x00
-#define  PCI_IO_RANGE_TYPE_32	0x01
-#define  PCI_IO_RANGE_MASK	~0x0f
-#define PCI_SEC_STATUS		0x1e	/* Secondary status register */
-#define PCI_MEMORY_BASE		0x20	/* Memory range behind */
-#define PCI_MEMORY_LIMIT	0x22
-#define  PCI_MEMORY_RANGE_TYPE_MASK 0x0f
-#define  PCI_MEMORY_RANGE_MASK	~0x0f
-#define PCI_PREF_MEMORY_BASE	0x24	/* Prefetchable memory range behind */
-#define PCI_PREF_MEMORY_LIMIT	0x26
-#define  PCI_PREF_RANGE_TYPE_MASK 0x0f
-#define  PCI_PREF_RANGE_TYPE_32	0x00
-#define  PCI_PREF_RANGE_TYPE_64	0x01
-#define  PCI_PREF_RANGE_MASK	~0x0f
-#define PCI_PREF_BASE_UPPER32	0x28	/* Upper half of prefetchable memory range */
-#define PCI_PREF_LIMIT_UPPER32	0x2c
-#define PCI_IO_BASE_UPPER16	0x30	/* Upper half of I/O addresses */
-#define PCI_IO_LIMIT_UPPER16	0x32
-/* 0x34 same as for htype 0 */
-/* 0x35-0x3b is reserved */
-#define PCI_ROM_ADDRESS1	0x38	/* Same as PCI_ROM_ADDRESS, but for htype 1 */
-/* 0x3c-0x3d are same as for htype 0 */
-#define PCI_BRIDGE_CONTROL	0x3e
-#define  PCI_BRIDGE_CTL_PARITY	0x01	/* Enable parity detection on secondary interface */
-#define  PCI_BRIDGE_CTL_SERR	0x02	/* The same for SERR forwarding */
-#define  PCI_BRIDGE_CTL_NO_ISA	0x04	/* Disable bridging of ISA ports */
-#define  PCI_BRIDGE_CTL_VGA	0x08	/* Forward VGA addresses */
-#define  PCI_BRIDGE_CTL_MASTER_ABORT 0x20  /* Report master aborts */
-#define  PCI_BRIDGE_CTL_BUS_RESET 0x40	/* Secondary bus reset */
-#define  PCI_BRIDGE_CTL_FAST_BACK 0x80	/* Fast Back2Back enabled on secondary interface */
-#define  PCI_BRIDGE_CTL_PRI_DISCARD_TIMER 0x100		/* PCI-X? */
-#define  PCI_BRIDGE_CTL_SEC_DISCARD_TIMER 0x200		/* PCI-X? */
-#define  PCI_BRIDGE_CTL_DISCARD_TIMER_STATUS 0x400	/* PCI-X? */
-#define  PCI_BRIDGE_CTL_DISCARD_TIMER_SERR_EN 0x800	/* PCI-X? */
-
-/* Header type 2 (CardBus bridges) */
-/* 0x14-0x15 reserved */
-#define PCI_CB_SEC_STATUS	0x16	/* Secondary status */
-#define PCI_CB_PRIMARY_BUS	0x18	/* PCI bus number */
-#define PCI_CB_CARD_BUS		0x19	/* CardBus bus number */
-#define PCI_CB_SUBORDINATE_BUS	0x1a	/* Subordinate bus number */
-#define PCI_CB_LATENCY_TIMER	0x1b	/* CardBus latency timer */
-#define PCI_CB_MEMORY_BASE_0	0x1c
-#define PCI_CB_MEMORY_LIMIT_0	0x20
-#define PCI_CB_MEMORY_BASE_1	0x24
-#define PCI_CB_MEMORY_LIMIT_1	0x28
-#define PCI_CB_IO_BASE_0	0x2c
-#define PCI_CB_IO_BASE_0_HI	0x2e
-#define PCI_CB_IO_LIMIT_0	0x30
-#define PCI_CB_IO_LIMIT_0_HI	0x32
-#define PCI_CB_IO_BASE_1	0x34
-#define PCI_CB_IO_BASE_1_HI	0x36
-#define PCI_CB_IO_LIMIT_1	0x38
-#define PCI_CB_IO_LIMIT_1_HI	0x3a
-#define  PCI_CB_IO_RANGE_MASK	~0x03
-/* 0x3c-0x3d are same as for htype 0 */
-#define PCI_CB_BRIDGE_CONTROL	0x3e
-#define  PCI_CB_BRIDGE_CTL_PARITY	0x01	/* Similar to standard bridge control register */
-#define  PCI_CB_BRIDGE_CTL_SERR		0x02
-#define  PCI_CB_BRIDGE_CTL_ISA		0x04
-#define  PCI_CB_BRIDGE_CTL_VGA		0x08
-#define  PCI_CB_BRIDGE_CTL_MASTER_ABORT	0x20
-#define  PCI_CB_BRIDGE_CTL_CB_RESET	0x40	/* CardBus reset */
-#define  PCI_CB_BRIDGE_CTL_16BIT_INT	0x80	/* Enable interrupt for 16-bit cards */
-#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100	/* Prefetch enable for both memory regions */
-#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
-#define  PCI_CB_BRIDGE_CTL_POST_WRITES	0x400
-#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40
-#define PCI_CB_SUBSYSTEM_ID	0x42
-#define PCI_CB_LEGACY_MODE_BASE	0x44	/* 16-bit PC Card legacy mode base address (ExCa) */
-/* 0x48-0x7f reserved */
-
-/* Capability lists */
-
-#define PCI_CAP_LIST_ID		0	/* Capability ID */
-#define  PCI_CAP_ID_PM		0x01	/* Power Management */
-#define  PCI_CAP_ID_AGP		0x02	/* Accelerated Graphics Port */
-#define  PCI_CAP_ID_VPD		0x03	/* Vital Product Data */
-#define  PCI_CAP_ID_SLOTID	0x04	/* Slot Identification */
-#define  PCI_CAP_ID_MSI		0x05	/* Message Signaled Interrupts */
-#define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
-#define  PCI_CAP_ID_PCIX        0x07    /* PCI-X */
-#define  PCI_CAP_ID_HT          0x08    /* HyperTransport */
-#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific */
-#define  PCI_CAP_ID_DBG		0x0A	/* Debug port */
-#define  PCI_CAP_ID_CCRC	0x0B	/* CompactPCI Central Resource Control */
-#define  PCI_CAP_ID_HOTPLUG	0x0C	/* PCI hot-plug */
-#define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
-#define  PCI_CAP_ID_AGP3	0x0E	/* AGP 8x */
-#define  PCI_CAP_ID_SECURE	0x0F	/* Secure device (?) */
-#define  PCI_CAP_ID_EXP		0x10	/* PCI Express */
-#define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
-#define  PCI_CAP_ID_SATA	0x12	/* Serial-ATA HBA */
-#define  PCI_CAP_ID_AF		0x13	/* Advanced features of PCI devices integrated in PCIe root cplx */
-#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
-#define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
-#define PCI_CAP_SIZEOF		4
-
-/* Capabilities residing in the PCI Express extended configuration space */
-
-#define PCI_EXT_CAP_ID_AER	0x01	/* Advanced Error Reporting */
-#define PCI_EXT_CAP_ID_VC	0x02	/* Virtual Channel */
-#define PCI_EXT_CAP_ID_DSN	0x03	/* Device Serial Number */
-#define PCI_EXT_CAP_ID_PB	0x04	/* Power Budgeting */
-#define PCI_EXT_CAP_ID_RCLINK	0x05	/* Root Complex Link Declaration */
-#define PCI_EXT_CAP_ID_RCILINK	0x06	/* Root Complex Internal Link Declaration */
-#define PCI_EXT_CAP_ID_RCECOLL	0x07	/* Root Complex Event Collector */
-#define PCI_EXT_CAP_ID_MFVC	0x08	/* Multi-Function Virtual Channel */
-#define PCI_EXT_CAP_ID_VC2	0x09	/* Virtual Channel (2nd ID) */
-#define PCI_EXT_CAP_ID_RBCB	0x0a	/* Root Bridge Control Block */
-#define PCI_EXT_CAP_ID_VNDR	0x0b	/* Vendor specific */
-#define PCI_EXT_CAP_ID_ACS	0x0d	/* Access Controls */
-#define PCI_EXT_CAP_ID_ARI	0x0e	/* Alternative Routing-ID Interpretation */
-#define PCI_EXT_CAP_ID_ATS	0x0f	/* Address Translation Service */
-#define PCI_EXT_CAP_ID_SRIOV	0x10	/* Single Root I/O Virtualization */
-#define PCI_EXT_CAP_ID_TPH	0x17	/* Transaction processing hints */
-#define PCI_EXT_CAP_ID_LTR	0x18	/* Latency Tolerance Reporting */
-
-/*** Definitions of capabilities ***/
-
-/* Power Management Registers */
-
-#define  PCI_PM_CAP_VER_MASK	0x0007	/* Version (2=PM1.1) */
-#define  PCI_PM_CAP_PME_CLOCK	0x0008	/* Clock required for PME generation */
-#define  PCI_PM_CAP_DSI		0x0020	/* Device specific initialization required */
-#define  PCI_PM_CAP_AUX_C_MASK	0x01c0	/* Maximum aux current required in D3cold */
-#define  PCI_PM_CAP_D1		0x0200	/* D1 power state support */
-#define  PCI_PM_CAP_D2		0x0400	/* D2 power state support */
-#define  PCI_PM_CAP_PME_D0	0x0800	/* PME can be asserted from D0 */
-#define  PCI_PM_CAP_PME_D1	0x1000	/* PME can be asserted from D1 */
-#define  PCI_PM_CAP_PME_D2	0x2000	/* PME can be asserted from D2 */
-#define  PCI_PM_CAP_PME_D3_HOT	0x4000	/* PME can be asserted from D3hot */
-#define  PCI_PM_CAP_PME_D3_COLD	0x8000	/* PME can be asserted from D3cold */
-#define PCI_PM_CTRL		4	/* PM control and status register */
-#define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RST	0x0008	/* No Soft Reset from D3hot to D0 */
-#define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
-#define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* PM table data index */
-#define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* PM table data scaling factor */
-#define  PCI_PM_CTRL_PME_STATUS	0x8000	/* PME pin status */
-#define PCI_PM_PPB_EXTENSIONS	6	/* PPB support extensions */
-#define  PCI_PM_PPB_B2_B3	0x40	/* If bridge enters D3hot, bus enters: 0=B3, 1=B2 */
-#define  PCI_PM_BPCC_ENABLE	0x80	/* Secondary bus is power managed */
-#define PCI_PM_DATA_REGISTER	7	/* PM table contents read here */
-#define PCI_PM_SIZEOF		8
-
-/* AGP registers */
-
-#define PCI_AGP_VERSION		2	/* BCD version number */
-#define PCI_AGP_RFU		3	/* Rest of capability flags */
-#define PCI_AGP_STATUS		4	/* Status register */
-#define  PCI_AGP_STATUS_RQ_MASK	0xff000000	/* Maximum number of requests - 1 */
-#define  PCI_AGP_STATUS_ISOCH	0x10000	/* Isochronous transactions supported */
-#define  PCI_AGP_STATUS_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
-#define  PCI_AGP_STATUS_CAL_MASK	0x1c00	/* Calibration cycle timing */
-#define  PCI_AGP_STATUS_SBA	0x0200	/* Sideband addressing supported */
-#define  PCI_AGP_STATUS_ITA_COH	0x0100	/* In-aperture accesses always coherent */
-#define  PCI_AGP_STATUS_GART64	0x0080	/* 64-bit GART entries supported */
-#define  PCI_AGP_STATUS_HTRANS	0x0040	/* If 0, core logic can xlate host CPU accesses thru aperture */
-#define  PCI_AGP_STATUS_64BIT	0x0020	/* 64-bit addressing cycles supported */
-#define  PCI_AGP_STATUS_FW	0x0010	/* Fast write transfers supported */
-#define  PCI_AGP_STATUS_AGP3	0x0008	/* AGP3 mode supported */
-#define  PCI_AGP_STATUS_RATE4	0x0004	/* 4x transfer rate supported (RFU in AGP3 mode) */
-#define  PCI_AGP_STATUS_RATE2	0x0002	/* 2x transfer rate supported (8x in AGP3 mode) */
-#define  PCI_AGP_STATUS_RATE1	0x0001	/* 1x transfer rate supported (4x in AGP3 mode) */
-#define PCI_AGP_COMMAND		8	/* Control register */
-#define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
-#define  PCI_AGP_COMMAND_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
-#define  PCI_AGP_COMMAND_CAL_MASK	0x1c00	/* Calibration cycle timing */
-#define  PCI_AGP_COMMAND_SBA	0x0200	/* Sideband addressing enabled */
-#define  PCI_AGP_COMMAND_AGP	0x0100	/* Allow processing of AGP transactions */
-#define  PCI_AGP_COMMAND_GART64	0x0080	/* 64-bit GART entries enabled */
-#define  PCI_AGP_COMMAND_64BIT	0x0020 	/* Allow generation of 64-bit addr cycles */
-#define  PCI_AGP_COMMAND_FW	0x0010 	/* Enable FW transfers */
-#define  PCI_AGP_COMMAND_RATE4	0x0004	/* Use 4x rate (RFU in AGP3 mode) */
-#define  PCI_AGP_COMMAND_RATE2	0x0002	/* Use 2x rate (8x in AGP3 mode) */
-#define  PCI_AGP_COMMAND_RATE1	0x0001	/* Use 1x rate (4x in AGP3 mode) */
-#define PCI_AGP_SIZEOF		12
-
-/* Vital Product Data */
-
-#define PCI_VPD_ADDR		2	/* Address to access (15 bits!) */
-#define  PCI_VPD_ADDR_MASK	0x7fff	/* Address mask */
-#define  PCI_VPD_ADDR_F		0x8000	/* Write 0, 1 indicates completion */
-#define PCI_VPD_DATA		4	/* 32-bits of data returned here */
-
-/* Slot Identification */
-
-#define PCI_SID_ESR		2	/* Expansion Slot Register */
-#define  PCI_SID_ESR_NSLOTS	0x1f	/* Number of expansion slots available */
-#define  PCI_SID_ESR_FIC	0x20	/* First In Chassis Flag */
-#define PCI_SID_CHASSIS_NR	3	/* Chassis Number */
-
-/* Message Signaled Interrupts registers */
-
-#define PCI_MSI_FLAGS		2	/* Various flags */
-#define  PCI_MSI_FLAGS_MASK_BIT	0x100	/* interrupt masking & reporting supported */
-#define  PCI_MSI_FLAGS_64BIT	0x080	/* 64-bit addresses allowed */
-#define  PCI_MSI_FLAGS_QSIZE	0x070	/* Message queue size configured */
-#define  PCI_MSI_FLAGS_QMASK	0x00e	/* Maximum queue size available */
-#define  PCI_MSI_FLAGS_ENABLE	0x001	/* MSI feature enabled */
-#define PCI_MSI_RFU		3	/* Rest of capability flags */
-#define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
-#define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
-#define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
-#define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
-#define PCI_MSI_MASK_BIT_32	12	/* per-vector masking for 32-bit devices */
-#define PCI_MSI_MASK_BIT_64	16	/* per-vector masking for 64-bit devices */
-#define PCI_MSI_PENDING_32	16	/* per-vector interrupt pending for 32-bit devices */
-#define PCI_MSI_PENDING_64	20	/* per-vector interrupt pending for 64-bit devices */
-
-/* PCI-X */
-#define PCI_PCIX_COMMAND                                                2 /* Command register offset */
-#define PCI_PCIX_COMMAND_DPERE                                     0x0001 /* Data Parity Error Recover Enable */
-#define PCI_PCIX_COMMAND_ERO                                       0x0002 /* Enable Relaxed Ordering */
-#define PCI_PCIX_COMMAND_MAX_MEM_READ_BYTE_COUNT                   0x000c /* Maximum Memory Read Byte Count */
-#define PCI_PCIX_COMMAND_MAX_OUTSTANDING_SPLIT_TRANS               0x0070
-#define PCI_PCIX_COMMAND_RESERVED                                   0xf80
-#define PCI_PCIX_STATUS                                                 4 /* Status register offset */
-#define PCI_PCIX_STATUS_FUNCTION                               0x00000007
-#define PCI_PCIX_STATUS_DEVICE                                 0x000000f8
-#define PCI_PCIX_STATUS_BUS                                    0x0000ff00
-#define PCI_PCIX_STATUS_64BIT                                  0x00010000
-#define PCI_PCIX_STATUS_133MHZ                                 0x00020000
-#define PCI_PCIX_STATUS_SC_DISCARDED                           0x00040000 /* Split Completion Discarded */
-#define PCI_PCIX_STATUS_UNEXPECTED_SC                          0x00080000 /* Unexpected Split Completion */
-#define PCI_PCIX_STATUS_DEVICE_COMPLEXITY                      0x00100000 /* 0 = simple device, 1 = bridge device */
-#define PCI_PCIX_STATUS_DESIGNED_MAX_MEM_READ_BYTE_COUNT       0x00600000 /* 0 = 512 bytes, 1 = 1024, 2 = 2048, 3 = 4096 */
-#define PCI_PCIX_STATUS_DESIGNED_MAX_OUTSTANDING_SPLIT_TRANS   0x03800000
-#define PCI_PCIX_STATUS_DESIGNED_MAX_CUMULATIVE_READ_SIZE      0x1c000000
-#define PCI_PCIX_STATUS_RCVD_SC_ERR_MESS                       0x20000000 /* Received Split Completion Error Message */
-#define PCI_PCIX_STATUS_266MHZ				       0x40000000 /* 266 MHz capable */
-#define PCI_PCIX_STATUS_533MHZ				       0x80000000 /* 533 MHz capable */
-#define PCI_PCIX_SIZEOF		4
-
-/* PCI-X Bridges */
-#define PCI_PCIX_BRIDGE_SEC_STATUS                                      2 /* Secondary bus status register offset */
-#define PCI_PCIX_BRIDGE_SEC_STATUS_64BIT                           0x0001
-#define PCI_PCIX_BRIDGE_SEC_STATUS_133MHZ                          0x0002
-#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_DISCARDED                    0x0004 /* Split Completion Discarded on secondary bus */
-#define PCI_PCIX_BRIDGE_SEC_STATUS_UNEXPECTED_SC                   0x0008 /* Unexpected Split Completion on secondary bus */
-#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_OVERRUN                      0x0010 /* Split Completion Overrun on secondary bus */
-#define PCI_PCIX_BRIDGE_SEC_STATUS_SPLIT_REQUEST_DELAYED           0x0020
-#define PCI_PCIX_BRIDGE_SEC_STATUS_CLOCK_FREQ                      0x01c0
-#define PCI_PCIX_BRIDGE_SEC_STATUS_RESERVED                        0xfe00
-#define PCI_PCIX_BRIDGE_STATUS                                          4 /* Primary bus status register offset */
-#define PCI_PCIX_BRIDGE_STATUS_FUNCTION                        0x00000007
-#define PCI_PCIX_BRIDGE_STATUS_DEVICE                          0x000000f8
-#define PCI_PCIX_BRIDGE_STATUS_BUS                             0x0000ff00
-#define PCI_PCIX_BRIDGE_STATUS_64BIT                           0x00010000
-#define PCI_PCIX_BRIDGE_STATUS_133MHZ                          0x00020000
-#define PCI_PCIX_BRIDGE_STATUS_SC_DISCARDED                    0x00040000 /* Split Completion Discarded */
-#define PCI_PCIX_BRIDGE_STATUS_UNEXPECTED_SC                   0x00080000 /* Unexpected Split Completion */
-#define PCI_PCIX_BRIDGE_STATUS_SC_OVERRUN                      0x00100000 /* Split Completion Overrun */
-#define PCI_PCIX_BRIDGE_STATUS_SPLIT_REQUEST_DELAYED           0x00200000
-#define PCI_PCIX_BRIDGE_STATUS_RESERVED                        0xffc00000
-#define PCI_PCIX_BRIDGE_UPSTREAM_SPLIT_TRANS_CTRL                       8 /* Upstream Split Transaction Register offset */
-#define PCI_PCIX_BRIDGE_DOWNSTREAM_SPLIT_TRANS_CTRL                    12 /* Downstream Split Transaction Register offset */
-#define PCI_PCIX_BRIDGE_STR_CAPACITY                           0x0000ffff
-#define PCI_PCIX_BRIDGE_STR_COMMITMENT_LIMIT                   0xffff0000
-#define PCI_PCIX_BRIDGE_SIZEOF 12
-
-/* HyperTransport (as of spec rev. 2.00) */
-#define PCI_HT_CMD		2	/* Command Register */
-#define  PCI_HT_CMD_TYP_HI	0xe000	/* Capability Type high part */
-#define  PCI_HT_CMD_TYP_HI_PRI	0x0000	/* Slave or Primary Interface */
-#define  PCI_HT_CMD_TYP_HI_SEC	0x2000	/* Host or Secondary Interface */
-#define  PCI_HT_CMD_TYP		0xf800	/* Capability Type */
-#define  PCI_HT_CMD_TYP_SW	0x4000	/* Switch */
-#define  PCI_HT_CMD_TYP_IDC	0x8000	/* Interrupt Discovery and Configuration */
-#define  PCI_HT_CMD_TYP_RID	0x8800	/* Revision ID */
-#define  PCI_HT_CMD_TYP_UIDC	0x9000	/* UnitID Clumping */
-#define  PCI_HT_CMD_TYP_ECSA	0x9800	/* Extended Configuration Space Access */
-#define  PCI_HT_CMD_TYP_AM	0xa000	/* Address Mapping */
-#define  PCI_HT_CMD_TYP_MSIM	0xa800	/* MSI Mapping */
-#define  PCI_HT_CMD_TYP_DR	0xb000	/* DirectRoute */
-#define  PCI_HT_CMD_TYP_VCS	0xb800	/* VCSet */
-#define  PCI_HT_CMD_TYP_RM	0xc000	/* Retry Mode */
-#define  PCI_HT_CMD_TYP_X86	0xc800	/* X86 (reserved) */
-
-					/* Link Control Register */
-#define  PCI_HT_LCTR_CFLE	0x0002	/* CRC Flood Enable */
-#define  PCI_HT_LCTR_CST	0x0004	/* CRC Start Test */
-#define  PCI_HT_LCTR_CFE	0x0008	/* CRC Force Error */
-#define  PCI_HT_LCTR_LKFAIL	0x0010	/* Link Failure */
-#define  PCI_HT_LCTR_INIT	0x0020	/* Initialization Complete */
-#define  PCI_HT_LCTR_EOC	0x0040	/* End of Chain */
-#define  PCI_HT_LCTR_TXO	0x0080	/* Transmitter Off */
-#define  PCI_HT_LCTR_CRCERR	0x0f00	/* CRC Error */
-#define  PCI_HT_LCTR_ISOCEN	0x1000	/* Isochronous Flow Control Enable */
-#define  PCI_HT_LCTR_LSEN	0x2000	/* LDTSTOP# Tristate Enable */
-#define  PCI_HT_LCTR_EXTCTL	0x4000	/* Extended CTL Time */
-#define  PCI_HT_LCTR_64B	0x8000	/* 64-bit Addressing Enable */
-
-					/* Link Configuration Register */
-#define  PCI_HT_LCNF_MLWI	0x0007	/* Max Link Width In */
-#define  PCI_HT_LCNF_LW_8B	0x0	/* Link Width 8 bits */
-#define  PCI_HT_LCNF_LW_16B	0x1	/* Link Width 16 bits */
-#define  PCI_HT_LCNF_LW_32B	0x3	/* Link Width 32 bits */
-#define  PCI_HT_LCNF_LW_2B	0x4	/* Link Width 2 bits */
-#define  PCI_HT_LCNF_LW_4B	0x5	/* Link Width 4 bits */
-#define  PCI_HT_LCNF_LW_NC	0x7	/* Link physically not connected */
-#define  PCI_HT_LCNF_DFI	0x0008	/* Doubleword Flow Control In */
-#define  PCI_HT_LCNF_MLWO	0x0070	/* Max Link Width Out */
-#define  PCI_HT_LCNF_DFO	0x0080	/* Doubleword Flow Control Out */
-#define  PCI_HT_LCNF_LWI	0x0700	/* Link Width In */
-#define  PCI_HT_LCNF_DFIE	0x0800	/* Doubleword Flow Control In Enable */
-#define  PCI_HT_LCNF_LWO	0x7000	/* Link Width Out */
-#define  PCI_HT_LCNF_DFOE	0x8000	/* Doubleword Flow Control Out Enable */
-
-					/* Revision ID Register */
-#define  PCI_HT_RID_MIN		0x1f	/* Minor Revision */
-#define  PCI_HT_RID_MAJ		0xe0	/* Major Revision */
-
-					/* Link Frequency/Error Register */
-#define  PCI_HT_LFRER_FREQ	0x0f	/* Transmitter Clock Frequency */
-#define  PCI_HT_LFRER_200	0x00	/* 200MHz */
-#define  PCI_HT_LFRER_300	0x01	/* 300MHz */
-#define  PCI_HT_LFRER_400	0x02	/* 400MHz */
-#define  PCI_HT_LFRER_500	0x03	/* 500MHz */
-#define  PCI_HT_LFRER_600	0x04	/* 600MHz */
-#define  PCI_HT_LFRER_800	0x05	/* 800MHz */
-#define  PCI_HT_LFRER_1000	0x06	/* 1.0GHz */
-#define  PCI_HT_LFRER_1200	0x07	/* 1.2GHz */
-#define  PCI_HT_LFRER_1400	0x08	/* 1.4GHz */
-#define  PCI_HT_LFRER_1600	0x09	/* 1.6GHz */
-#define  PCI_HT_LFRER_VEND	0x0f	/* Vendor-Specific */
-#define  PCI_HT_LFRER_ERR	0xf0	/* Link Error */
-#define  PCI_HT_LFRER_PROT	0x10	/* Protocol Error */
-#define  PCI_HT_LFRER_OV	0x20	/* Overflow Error */
-#define  PCI_HT_LFRER_EOC	0x40	/* End of Chain Error */
-#define  PCI_HT_LFRER_CTLT	0x80	/* CTL Timeout */
-
-					/* Link Frequency Capability Register */
-#define  PCI_HT_LFCAP_200	0x0001	/* 200MHz */
-#define  PCI_HT_LFCAP_300	0x0002	/* 300MHz */
-#define  PCI_HT_LFCAP_400	0x0004	/* 400MHz */
-#define  PCI_HT_LFCAP_500	0x0008	/* 500MHz */
-#define  PCI_HT_LFCAP_600	0x0010	/* 600MHz */
-#define  PCI_HT_LFCAP_800	0x0020	/* 800MHz */
-#define  PCI_HT_LFCAP_1000	0x0040	/* 1.0GHz */
-#define  PCI_HT_LFCAP_1200	0x0080	/* 1.2GHz */
-#define  PCI_HT_LFCAP_1400	0x0100	/* 1.4GHz */
-#define  PCI_HT_LFCAP_1600	0x0200	/* 1.6GHz */
-#define  PCI_HT_LFCAP_VEND	0x8000	/* Vendor-Specific */
-
-					/* Feature Register */
-#define  PCI_HT_FTR_ISOCFC	0x0001	/* Isochronous Flow Control Mode */
-#define  PCI_HT_FTR_LDTSTOP	0x0002	/* LDTSTOP# Supported */
-#define  PCI_HT_FTR_CRCTM	0x0004	/* CRC Test Mode */
-#define  PCI_HT_FTR_ECTLT	0x0008	/* Extended CTL Time Required */
-#define  PCI_HT_FTR_64BA	0x0010	/* 64-bit Addressing */
-#define  PCI_HT_FTR_UIDRD	0x0020	/* UnitID Reorder Disable */
-
-					/* Error Handling Register */
-#define  PCI_HT_EH_PFLE		0x0001	/* Protocol Error Flood Enable */
-#define  PCI_HT_EH_OFLE		0x0002	/* Overflow Error Flood Enable */
-#define  PCI_HT_EH_PFE		0x0004	/* Protocol Error Fatal Enable */
-#define  PCI_HT_EH_OFE		0x0008	/* Overflow Error Fatal Enable */
-#define  PCI_HT_EH_EOCFE	0x0010	/* End of Chain Error Fatal Enable */
-#define  PCI_HT_EH_RFE		0x0020	/* Response Error Fatal Enable */
-#define  PCI_HT_EH_CRCFE	0x0040	/* CRC Error Fatal Enable */
-#define  PCI_HT_EH_SERRFE	0x0080	/* System Error Fatal Enable (B */
-#define  PCI_HT_EH_CF		0x0100	/* Chain Fail */
-#define  PCI_HT_EH_RE		0x0200	/* Response Error */
-#define  PCI_HT_EH_PNFE		0x0400	/* Protocol Error Nonfatal Enable */
-#define  PCI_HT_EH_ONFE		0x0800	/* Overflow Error Nonfatal Enable */
-#define  PCI_HT_EH_EOCNFE	0x1000	/* End of Chain Error Nonfatal Enable */
-#define  PCI_HT_EH_RNFE		0x2000	/* Response Error Nonfatal Enable */
-#define  PCI_HT_EH_CRCNFE	0x4000	/* CRC Error Nonfatal Enable */
-#define  PCI_HT_EH_SERRNFE	0x8000	/* System Error Nonfatal Enable */
-
-/* HyperTransport: Slave or Primary Interface */
-#define PCI_HT_PRI_CMD		2	/* Command Register */
-#define  PCI_HT_PRI_CMD_BUID	0x001f	/* Base UnitID */
-#define  PCI_HT_PRI_CMD_UC	0x03e0	/* Unit Count */
-#define  PCI_HT_PRI_CMD_MH	0x0400	/* Master Host */
-#define  PCI_HT_PRI_CMD_DD	0x0800	/* Default Direction */
-#define  PCI_HT_PRI_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
-
-#define PCI_HT_PRI_LCTR0	4	/* Link Control 0 Register */
-#define PCI_HT_PRI_LCNF0	6	/* Link Config 0 Register */
-#define PCI_HT_PRI_LCTR1	8	/* Link Control 1 Register */
-#define PCI_HT_PRI_LCNF1	10	/* Link Config 1 Register */
-#define PCI_HT_PRI_RID		12	/* Revision ID Register */
-#define PCI_HT_PRI_LFRER0	13	/* Link Frequency/Error 0 Register */
-#define PCI_HT_PRI_LFCAP0	14	/* Link Frequency Capability 0 Register */
-#define PCI_HT_PRI_FTR		16	/* Feature Register */
-#define PCI_HT_PRI_LFRER1	17	/* Link Frequency/Error 1 Register */
-#define PCI_HT_PRI_LFCAP1	18	/* Link Frequency Capability 1 Register */
-#define PCI_HT_PRI_ES		20	/* Enumeration Scratchpad Register */
-#define PCI_HT_PRI_EH		22	/* Error Handling Register */
-#define PCI_HT_PRI_MBU		24	/* Memory Base Upper Register */
-#define PCI_HT_PRI_MLU		25	/* Memory Limit Upper Register */
-#define PCI_HT_PRI_BN		26	/* Bus Number Register */
-#define PCI_HT_PRI_SIZEOF	28
-
-/* HyperTransport: Host or Secondary Interface */
-#define PCI_HT_SEC_CMD		2	/* Command Register */
-#define  PCI_HT_SEC_CMD_WR	0x0001	/* Warm Reset */
-#define  PCI_HT_SEC_CMD_DE	0x0002	/* Double-Ended */
-#define  PCI_HT_SEC_CMD_DN	0x0076	/* Device Number */
-#define  PCI_HT_SEC_CMD_CS	0x0080	/* Chain Side */
-#define  PCI_HT_SEC_CMD_HH	0x0100	/* Host Hide */
-#define  PCI_HT_SEC_CMD_AS	0x0400	/* Act as Slave */
-#define  PCI_HT_SEC_CMD_HIECE	0x0800	/* Host Inbound End of Chain Error */
-#define  PCI_HT_SEC_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
-
-#define PCI_HT_SEC_LCTR		4	/* Link Control Register */
-#define PCI_HT_SEC_LCNF		6	/* Link Config Register */
-#define PCI_HT_SEC_RID		8	/* Revision ID Register */
-#define PCI_HT_SEC_LFRER	9	/* Link Frequency/Error Register */
-#define PCI_HT_SEC_LFCAP	10	/* Link Frequency Capability Register */
-#define PCI_HT_SEC_FTR		12	/* Feature Register */
-#define  PCI_HT_SEC_FTR_EXTRS	0x0100	/* Extended Register Set */
-#define  PCI_HT_SEC_FTR_UCNFE	0x0200	/* Upstream Configuration Enable */
-#define PCI_HT_SEC_ES		16	/* Enumeration Scratchpad Register */
-#define PCI_HT_SEC_EH		18	/* Error Handling Register */
-#define PCI_HT_SEC_MBU		20	/* Memory Base Upper Register */
-#define PCI_HT_SEC_MLU		21	/* Memory Limit Upper Register */
-#define PCI_HT_SEC_SIZEOF	24
-
-/* HyperTransport: Switch */
-#define PCI_HT_SW_CMD		2	/* Switch Command Register */
-#define  PCI_HT_SW_CMD_VIBERR	0x0080	/* VIB Error */
-#define  PCI_HT_SW_CMD_VIBFL	0x0100	/* VIB Flood */
-#define  PCI_HT_SW_CMD_VIBFT	0x0200	/* VIB Fatal */
-#define  PCI_HT_SW_CMD_VIBNFT	0x0400	/* VIB Nonfatal */
-#define PCI_HT_SW_PMASK		4	/* Partition Mask Register */
-#define PCI_HT_SW_SWINF		8	/* Switch Info Register */
-#define  PCI_HT_SW_SWINF_DP	0x0000001f /* Default Port */
-#define  PCI_HT_SW_SWINF_EN	0x00000020 /* Enable Decode */
-#define  PCI_HT_SW_SWINF_CR	0x00000040 /* Cold Reset */
-#define  PCI_HT_SW_SWINF_PCIDX	0x00000f00 /* Performance Counter Index */
-#define  PCI_HT_SW_SWINF_BLRIDX	0x0003f000 /* Base/Limit Range Index */
-#define  PCI_HT_SW_SWINF_SBIDX	0x00002000 /* Secondary Base Range Index */
-#define  PCI_HT_SW_SWINF_HP	0x00040000 /* Hot Plug */
-#define  PCI_HT_SW_SWINF_HIDE	0x00080000 /* Hide Port */
-#define PCI_HT_SW_PCD		12	/* Performance Counter Data Register */
-#define PCI_HT_SW_BLRD		16	/* Base/Limit Range Data Register */
-#define PCI_HT_SW_SBD		20	/* Secondary Base Data Register */
-#define PCI_HT_SW_SIZEOF	24
-
-					/* Counter indices */
-#define  PCI_HT_SW_PC_PCR	0x0	/* Posted Command Receive */
-#define  PCI_HT_SW_PC_NPCR	0x1	/* Nonposted Command Receive */
-#define  PCI_HT_SW_PC_RCR	0x2	/* Response Command Receive */
-#define  PCI_HT_SW_PC_PDWR	0x3	/* Posted DW Receive */
-#define  PCI_HT_SW_PC_NPDWR	0x4	/* Nonposted DW Receive */
-#define  PCI_HT_SW_PC_RDWR	0x5	/* Response DW Receive */
-#define  PCI_HT_SW_PC_PCT	0x6	/* Posted Command Transmit */
-#define  PCI_HT_SW_PC_NPCT	0x7	/* Nonposted Command Transmit */
-#define  PCI_HT_SW_PC_RCT	0x8	/* Response Command Transmit */
-#define  PCI_HT_SW_PC_PDWT	0x9	/* Posted DW Transmit */
-#define  PCI_HT_SW_PC_NPDWT	0xa	/* Nonposted DW Transmit */
-#define  PCI_HT_SW_PC_RDWT	0xb	/* Response DW Transmit */
-
-					/* Base/Limit Range indices */
-#define  PCI_HT_SW_BLR_BASE0_LO	0x0	/* Base 0[31:1], Enable */
-#define  PCI_HT_SW_BLR_BASE0_HI	0x1	/* Base 0 Upper */
-#define  PCI_HT_SW_BLR_LIM0_LO	0x2	/* Limit 0 Lower */
-#define  PCI_HT_SW_BLR_LIM0_HI	0x3	/* Limit 0 Upper */
-
-					/* Secondary Base indices */
-#define  PCI_HT_SW_SB_LO	0x0	/* Secondary Base[31:1], Enable */
-#define  PCI_HT_SW_S0_HI	0x1	/* Secondary Base Upper */
-
-/* HyperTransport: Interrupt Discovery and Configuration */
-#define PCI_HT_IDC_IDX		2	/* Index Register */
-#define PCI_HT_IDC_DATA		4	/* Data Register */
-#define PCI_HT_IDC_SIZEOF	8
-
-					/* Register indices */
-#define  PCI_HT_IDC_IDX_LINT	0x01	/* Last Interrupt Register */
-#define   PCI_HT_IDC_LINT	0x00ff0000 /* Last interrupt definition */
-#define  PCI_HT_IDC_IDX_IDR	0x10	/* Interrupt Definition Registers */
-					/* Low part (at index) */
-#define   PCI_HT_IDC_IDR_MASK	0x10000001 /* Mask */
-#define   PCI_HT_IDC_IDR_POL	0x10000002 /* Polarity */
-#define   PCI_HT_IDC_IDR_II_2	0x1000001c /* IntrInfo[4:2]: Message Type */
-#define   PCI_HT_IDC_IDR_II_5	0x10000020 /* IntrInfo[5]: Request EOI */
-#define   PCI_HT_IDC_IDR_II_6	0x00ffffc0 /* IntrInfo[23:6] */
-#define   PCI_HT_IDC_IDR_II_24	0xff000000 /* IntrInfo[31:24] */
-					/* High part (at index + 1) */
-#define   PCI_HT_IDC_IDR_II_32	0x00ffffff /* IntrInfo[55:32] */
-#define   PCI_HT_IDC_IDR_PASSPW	0x40000000 /* PassPW setting for messages */
-#define   PCI_HT_IDC_IDR_WEOI	0x80000000 /* Waiting for EOI */
-
-/* HyperTransport: Revision ID */
-#define PCI_HT_RID_RID		2	/* Revision Register */
-#define PCI_HT_RID_SIZEOF	4
-
-/* HyperTransport: UnitID Clumping */
-#define PCI_HT_UIDC_CS		4	/* Clumping Support Register */
-#define PCI_HT_UIDC_CE		8	/* Clumping Enable Register */
-#define PCI_HT_UIDC_SIZEOF	12
-
-/* HyperTransport: Extended Configuration Space Access */
-#define PCI_HT_ECSA_ADDR	4	/* Configuration Address Register */
-#define  PCI_HT_ECSA_ADDR_REG	0x00000ffc /* Register */
-#define  PCI_HT_ECSA_ADDR_FUN	0x00007000 /* Function */
-#define  PCI_HT_ECSA_ADDR_DEV	0x000f1000 /* Device */
-#define  PCI_HT_ECSA_ADDR_BUS	0x0ff00000 /* Bus Number */
-#define  PCI_HT_ECSA_ADDR_TYPE	0x10000000 /* Access Type */
-#define PCI_HT_ECSA_DATA	8	/* Configuration Data Register */
-#define PCI_HT_ECSA_SIZEOF	12
-
-/* HyperTransport: Address Mapping */
-#define PCI_HT_AM_CMD		2	/* Command Register */
-#define  PCI_HT_AM_CMD_NDMA	0x000f	/* Number of DMA Mappings */
-#define  PCI_HT_AM_CMD_IOSIZ	0x01f0	/* I/O Size */
-#define  PCI_HT_AM_CMD_MT	0x0600	/* Map Type */
-#define  PCI_HT_AM_CMD_MT_40B	0x0000	/* 40-bit */
-#define  PCI_HT_AM_CMD_MT_64B	0x0200	/* 64-bit */
-
-					/* Window Control Register bits */
-#define  PCI_HT_AM_SBW_CTR_COMP	0x1	/* Compat */
-#define  PCI_HT_AM_SBW_CTR_NCOH	0x2	/* NonCoherent */
-#define  PCI_HT_AM_SBW_CTR_ISOC	0x4	/* Isochronous */
-#define  PCI_HT_AM_SBW_CTR_EN	0x8	/* Enable */
-
-/* HyperTransport: 40-bit Address Mapping */
-#define PCI_HT_AM40_SBNPW	4	/* Secondary Bus Non-Prefetchable Window Register */
-#define  PCI_HT_AM40_SBW_BASE	0x000fffff /* Window Base */
-#define  PCI_HT_AM40_SBW_CTR	0xf0000000 /* Window Control */
-#define PCI_HT_AM40_SBPW	8	/* Secondary Bus Prefetchable Window Register */
-#define PCI_HT_AM40_DMA_PBASE0	12	/* DMA Window Primary Base 0 Register */
-#define PCI_HT_AM40_DMA_CTR0	15	/* DMA Window Control 0 Register */
-#define  PCI_HT_AM40_DMA_CTR_CTR 0xf0	/* Window Control */
-#define PCI_HT_AM40_DMA_SLIM0	16	/* DMA Window Secondary Limit 0 Register */
-#define PCI_HT_AM40_DMA_SBASE0	18	/* DMA Window Secondary Base 0 Register */
-#define PCI_HT_AM40_SIZEOF	12	/* size is variable: 12 + 8 * NDMA */
-
-/* HyperTransport: 64-bit Address Mapping */
-#define PCI_HT_AM64_IDX		4	/* Index Register */
-#define PCI_HT_AM64_DATA_LO	8	/* Data Lower Register */
-#define PCI_HT_AM64_DATA_HI	12	/* Data Upper Register */
-#define PCI_HT_AM64_SIZEOF	16
-
-					/* Register indices */
-#define  PCI_HT_AM64_IDX_SBNPW	0x00	/* Secondary Bus Non-Prefetchable Window Register */
-#define   PCI_HT_AM64_W_BASE_LO	0xfff00000 /* Window Base Lower */
-#define   PCI_HT_AM64_W_CTR	0x0000000f /* Window Control */
-#define  PCI_HT_AM64_IDX_SBPW	0x01	/* Secondary Bus Prefetchable Window Register */
-#define   PCI_HT_AM64_IDX_PBNPW	0x02	/* Primary Bus Non-Prefetchable Window Register */
-#define   PCI_HT_AM64_IDX_DMAPB0 0x04	/* DMA Window Primary Base 0 Register */
-#define   PCI_HT_AM64_IDX_DMASB0 0x05	/* DMA Window Secondary Base 0 Register */
-#define   PCI_HT_AM64_IDX_DMASL0 0x06	/* DMA Window Secondary Limit 0 Register */
-
-/* HyperTransport: MSI Mapping */
-#define PCI_HT_MSIM_CMD		2	/* Command Register */
-#define  PCI_HT_MSIM_CMD_EN	0x0001	/* Mapping Active */
-#define  PCI_HT_MSIM_CMD_FIXD	0x0002	/* MSI Mapping Address Fixed */
-#define PCI_HT_MSIM_ADDR_LO	4	/* MSI Mapping Address Lower Register */
-#define PCI_HT_MSIM_ADDR_HI	8	/* MSI Mapping Address Upper Register */
-#define PCI_HT_MSIM_SIZEOF	12
-
-/* HyperTransport: DirectRoute */
-#define PCI_HT_DR_CMD		2	/* Command Register */
-#define  PCI_HT_DR_CMD_NDRS	0x000f	/* Number of DirectRoute Spaces */
-#define  PCI_HT_DR_CMD_IDX	0x01f0	/* Index */
-#define PCI_HT_DR_EN		4	/* Enable Vector Register */
-#define PCI_HT_DR_DATA		8	/* Data Register */
-#define PCI_HT_DR_SIZEOF	12
-
-					/* Register indices */
-#define  PCI_HT_DR_IDX_BASE_LO	0x00	/* DirectRoute Base Lower Register */
-#define   PCI_HT_DR_OTNRD	0x00000001 /* Opposite to Normal Request Direction */
-#define   PCI_HT_DR_BL_LO	0xffffff00 /* Base/Limit Lower */
-#define  PCI_HT_DR_IDX_BASE_HI	0x01	/* DirectRoute Base Upper Register */
-#define  PCI_HT_DR_IDX_LIMIT_LO	0x02	/* DirectRoute Limit Lower Register */
-#define  PCI_HT_DR_IDX_LIMIT_HI	0x03	/* DirectRoute Limit Upper Register */
-
-/* HyperTransport: VCSet */
-#define PCI_HT_VCS_SUP		4	/* VCSets Supported Register */
-#define PCI_HT_VCS_L1EN		5	/* Link 1 VCSets Enabled Register */
-#define PCI_HT_VCS_L0EN		6	/* Link 0 VCSets Enabled Register */
-#define PCI_HT_VCS_SBD		8	/* Stream Bucket Depth Register */
-#define PCI_HT_VCS_SINT		9	/* Stream Interval Register */
-#define PCI_HT_VCS_SSUP		10	/* Number of Streaming VCs Supported Register */
-#define  PCI_HT_VCS_SSUP_0	0x00	/* Streaming VC 0 */
-#define  PCI_HT_VCS_SSUP_3	0x01	/* Streaming VCs 0-3 */
-#define  PCI_HT_VCS_SSUP_15	0x02	/* Streaming VCs 0-15 */
-#define PCI_HT_VCS_NFCBD	12	/* Non-FC Bucket Depth Register */
-#define PCI_HT_VCS_NFCINT	13	/* Non-FC Bucket Interval Register */
-#define PCI_HT_VCS_SIZEOF	16
-
-/* HyperTransport: Retry Mode */
-#define PCI_HT_RM_CTR0		4	/* Control 0 Register */
-#define  PCI_HT_RM_CTR_LRETEN	0x01	/* Link Retry Enable */
-#define  PCI_HT_RM_CTR_FSER	0x02	/* Force Single Error */
-#define  PCI_HT_RM_CTR_ROLNEN	0x04	/* Rollover Nonfatal Enable */
-#define  PCI_HT_RM_CTR_FSS	0x08	/* Force Single Stomp */
-#define  PCI_HT_RM_CTR_RETNEN	0x10	/* Retry Nonfatal Enable */
-#define  PCI_HT_RM_CTR_RETFEN	0x20	/* Retry Fatal Enable */
-#define  PCI_HT_RM_CTR_AA	0xc0	/* Allowed Attempts */
-#define PCI_HT_RM_STS0		5	/* Status 0 Register */
-#define  PCI_HT_RM_STS_RETSNT	0x01	/* Retry Sent */
-#define  PCI_HT_RM_STS_CNTROL	0x02	/* Count Rollover */
-#define  PCI_HT_RM_STS_SRCV	0x04	/* Stomp Received */
-#define PCI_HT_RM_CTR1		6	/* Control 1 Register */
-#define PCI_HT_RM_STS1		7	/* Status 1 Register */
-#define PCI_HT_RM_CNT0		8	/* Retry Count 0 Register */
-#define PCI_HT_RM_CNT1		10	/* Retry Count 1 Register */
-#define PCI_HT_RM_SIZEOF	12
-
-/* Vendor-Specific Capability (see PCI_EVNDR_xxx for the PCIe version) */
-#define PCI_VNDR_LENGTH		2	/* Length byte */
-
-/* PCI Express */
-#define PCI_EXP_FLAGS		0x2	/* Capabilities register */
-#define PCI_EXP_FLAGS_VERS	0x000f	/* Capability version */
-#define PCI_EXP_FLAGS_TYPE	0x00f0	/* Device/Port type */
-#define  PCI_EXP_TYPE_ENDPOINT	0x0	/* Express Endpoint */
-#define  PCI_EXP_TYPE_LEG_END	0x1	/* Legacy Endpoint */
-#define  PCI_EXP_TYPE_ROOT_PORT 0x4	/* Root Port */
-#define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
-#define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
-#define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
-#define  PCI_EXP_TYPE_PCIE_BRIDGE 0x8	/* PCI/PCI-X to PCIE Bridge */
-#define  PCI_EXP_TYPE_ROOT_INT_EP 0x9	/* Root Complex Integrated Endpoint */
-#define  PCI_EXP_TYPE_ROOT_EC 0xa	/* Root Complex Event Collector */
-#define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
-#define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
-#define PCI_EXP_DEVCAP		0x4	/* Device capabilities */
-#define  PCI_EXP_DEVCAP_PAYLOAD	0x07	/* Max_Payload_Size */
-#define  PCI_EXP_DEVCAP_PHANTOM	0x18	/* Phantom functions */
-#define  PCI_EXP_DEVCAP_EXT_TAG	0x20	/* Extended tags */
-#define  PCI_EXP_DEVCAP_L0S	0x1c0	/* L0s Acceptable Latency */
-#define  PCI_EXP_DEVCAP_L1	0xe00	/* L1 Acceptable Latency */
-#define  PCI_EXP_DEVCAP_ATN_BUT	0x1000	/* Attention Button Present */
-#define  PCI_EXP_DEVCAP_ATN_IND	0x2000	/* Attention Indicator Present */
-#define  PCI_EXP_DEVCAP_PWR_IND	0x4000	/* Power Indicator Present */
-#define  PCI_EXP_DEVCAP_RBE	0x8000	/* Role-Based Error Reporting */
-#define  PCI_EXP_DEVCAP_PWR_VAL	0x3fc0000 /* Slot Power Limit Value */
-#define  PCI_EXP_DEVCAP_PWR_SCL	0xc000000 /* Slot Power Limit Scale */
-#define  PCI_EXP_DEVCAP_FLRESET	0x10000000 /* Function-Level Reset */
-#define PCI_EXP_DEVCTL		0x8	/* Device Control */
-#define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
-#define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
-#define  PCI_EXP_DEVCTL_FERE	0x0004	/* Fatal Error Reporting Enable */
-#define  PCI_EXP_DEVCTL_URRE	0x0008	/* Unsupported Request Reporting En. */
-#define  PCI_EXP_DEVCTL_RELAXED	0x0010	/* Enable Relaxed Ordering */
-#define  PCI_EXP_DEVCTL_PAYLOAD	0x00e0	/* Max_Payload_Size */
-#define  PCI_EXP_DEVCTL_EXT_TAG	0x0100	/* Extended Tag Field Enable */
-#define  PCI_EXP_DEVCTL_PHANTOM	0x0200	/* Phantom Functions Enable */
-#define  PCI_EXP_DEVCTL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
-#define  PCI_EXP_DEVCTL_NOSNOOP	0x0800	/* Enable No Snoop */
-#define  PCI_EXP_DEVCTL_READRQ	0x7000	/* Max_Read_Request_Size */
-#define  PCI_EXP_DEVCTL_BCRE	0x8000	/* Bridge Configuration Retry Enable */
-#define  PCI_EXP_DEVCTL_FLRESET	0x8000	/* Function-Level Reset [bit shared with BCRE] */
-#define PCI_EXP_DEVSTA		0xa	/* Device Status */
-#define  PCI_EXP_DEVSTA_CED	0x01	/* Correctable Error Detected */
-#define  PCI_EXP_DEVSTA_NFED	0x02	/* Non-Fatal Error Detected */
-#define  PCI_EXP_DEVSTA_FED	0x04	/* Fatal Error Detected */
-#define  PCI_EXP_DEVSTA_URD	0x08	/* Unsupported Request Detected */
-#define  PCI_EXP_DEVSTA_AUXPD	0x10	/* AUX Power Detected */
-#define  PCI_EXP_DEVSTA_TRPND	0x20	/* Transactions Pending */
-#define PCI_EXP_LNKCAP		0xc	/* Link Capabilities */
-#define  PCI_EXP_LNKCAP_SPEED	0x0000f	/* Maximum Link Speed */
-#define  PCI_EXP_LNKCAP_WIDTH	0x003f0	/* Maximum Link Width */
-#define  PCI_EXP_LNKCAP_ASPM	0x00c00	/* Active State Power Management */
-#define  PCI_EXP_LNKCAP_L0S	0x07000	/* L0s Acceptable Latency */
-#define  PCI_EXP_LNKCAP_L1	0x38000	/* L1 Acceptable Latency */
-#define  PCI_EXP_LNKCAP_CLOCKPM	0x40000	/* Clock Power Management */
-#define  PCI_EXP_LNKCAP_SURPRISE 0x80000 /* Surprise Down Error Reporting */
-#define  PCI_EXP_LNKCAP_DLLA	0x100000 /* Data Link Layer Active Reporting */
-#define  PCI_EXP_LNKCAP_LBNC	0x200000 /* Link Bandwidth Notification Capability */
-#define  PCI_EXP_LNKCAP_PORT	0xff000000 /* Port Number */
-#define PCI_EXP_LNKCTL		0x10	/* Link Control */
-#define  PCI_EXP_LNKCTL_ASPM	0x0003	/* ASPM Control */
-#define  PCI_EXP_LNKCTL_RCB	0x0008	/* Read Completion Boundary */
-#define  PCI_EXP_LNKCTL_DISABLE	0x0010	/* Link Disable */
-#define  PCI_EXP_LNKCTL_RETRAIN	0x0020	/* Retrain Link */
-#define  PCI_EXP_LNKCTL_CLOCK	0x0040	/* Common Clock Configuration */
-#define  PCI_EXP_LNKCTL_XSYNCH	0x0080	/* Extended Synch */
-#define  PCI_EXP_LNKCTL_CLOCKPM	0x0100	/* Clock Power Management */
-#define  PCI_EXP_LNKCTL_HWAUTWD	0x0200	/* Hardware Autonomous Width Disable */
-#define  PCI_EXP_LNKCTL_BWMIE	0x0400	/* Bandwidth Mgmt Interrupt Enable */
-#define  PCI_EXP_LNKCTL_AUTBWIE	0x0800	/* Autonomous Bandwidth Mgmt Interrupt Enable */
-#define PCI_EXP_LNKSTA		0x12	/* Link Status */
-#define  PCI_EXP_LNKSTA_SPEED	0x000f	/* Negotiated Link Speed */
-#define  PCI_EXP_LNKSTA_WIDTH	0x03f0	/* Negotiated Link Width */
-#define  PCI_EXP_LNKSTA_TR_ERR	0x0400	/* Training Error (obsolete) */
-#define  PCI_EXP_LNKSTA_TRAIN	0x0800	/* Link Training */
-#define  PCI_EXP_LNKSTA_SL_CLK	0x1000	/* Slot Clock Configuration */
-#define  PCI_EXP_LNKSTA_DL_ACT	0x2000	/* Data Link Layer in DL_Active State */
-#define  PCI_EXP_LNKSTA_BWMGMT	0x4000	/* Bandwidth Mgmt Status */
-#define  PCI_EXP_LNKSTA_AUTBW	0x8000	/* Autonomous Bandwidth Mgmt Status */
-#define PCI_EXP_SLTCAP		0x14	/* Slot Capabilities */
-#define  PCI_EXP_SLTCAP_ATNB	0x0001	/* Attention Button Present */
-#define  PCI_EXP_SLTCAP_PWRC	0x0002	/* Power Controller Present */
-#define  PCI_EXP_SLTCAP_MRL	0x0004	/* MRL Sensor Present */
-#define  PCI_EXP_SLTCAP_ATNI	0x0008	/* Attention Indicator Present */
-#define  PCI_EXP_SLTCAP_PWRI	0x0010	/* Power Indicator Present */
-#define  PCI_EXP_SLTCAP_HPS	0x0020	/* Hot-Plug Surprise */
-#define  PCI_EXP_SLTCAP_HPC	0x0040	/* Hot-Plug Capable */
-#define  PCI_EXP_SLTCAP_PWR_VAL	0x00007f80 /* Slot Power Limit Value */
-#define  PCI_EXP_SLTCAP_PWR_SCL	0x00018000 /* Slot Power Limit Scale */
-#define  PCI_EXP_SLTCAP_INTERLOCK 0x020000 /* Electromechanical Interlock Present */
-#define  PCI_EXP_SLTCAP_NOCMDCOMP 0x040000 /* No Command Completed Support */
-#define  PCI_EXP_SLTCAP_PSN	0xfff80000 /* Physical Slot Number */
-#define PCI_EXP_SLTCTL		0x18	/* Slot Control */
-#define  PCI_EXP_SLTCTL_ATNB	0x0001	/* Attention Button Pressed Enable */
-#define  PCI_EXP_SLTCTL_PWRF	0x0002	/* Power Fault Detected Enable */
-#define  PCI_EXP_SLTCTL_MRLS	0x0004	/* MRL Sensor Changed Enable */
-#define  PCI_EXP_SLTCTL_PRSD	0x0008	/* Presence Detect Changed Enable */
-#define  PCI_EXP_SLTCTL_CMDC	0x0010	/* Command Completed Interrupt Enable */
-#define  PCI_EXP_SLTCTL_HPIE	0x0020	/* Hot-Plug Interrupt Enable */
-#define  PCI_EXP_SLTCTL_ATNI	0x00c0	/* Attention Indicator Control */
-#define  PCI_EXP_SLTCTL_PWRI	0x0300	/* Power Indicator Control */
-#define  PCI_EXP_SLTCTL_PWRC	0x0400	/* Power Controller Control */
-#define  PCI_EXP_SLTCTL_INTERLOCK 0x0800 /* Electromechanical Interlock Control */
-#define  PCI_EXP_SLTCTL_LLCHG	0x1000	/* Data Link Layer State Changed Enable */
-#define PCI_EXP_SLTSTA		0x1a	/* Slot Status */
-#define  PCI_EXP_SLTSTA_ATNB	0x0001	/* Attention Button Pressed */
-#define  PCI_EXP_SLTSTA_PWRF	0x0002	/* Power Fault Detected */
-#define  PCI_EXP_SLTSTA_MRLS	0x0004	/* MRL Sensor Changed */
-#define  PCI_EXP_SLTSTA_PRSD	0x0008	/* Presence Detect Changed */
-#define  PCI_EXP_SLTSTA_CMDC	0x0010	/* Command Completed */
-#define  PCI_EXP_SLTSTA_MRL_ST	0x0020	/* MRL Sensor State */
-#define  PCI_EXP_SLTSTA_PRES	0x0040	/* Presence Detect State */
-#define  PCI_EXP_SLTSTA_INTERLOCK 0x0080 /* Electromechanical Interlock Status */
-#define  PCI_EXP_SLTSTA_LLCHG	0x0100	/* Data Link Layer State Changed */
-#define PCI_EXP_RTCTL		0x1c	/* Root Control */
-#define  PCI_EXP_RTCTL_SECEE	0x0001	/* System Error on Correctable Error */
-#define  PCI_EXP_RTCTL_SENFEE	0x0002	/* System Error on Non-Fatal Error */
-#define  PCI_EXP_RTCTL_SEFEE	0x0004	/* System Error on Fatal Error */
-#define  PCI_EXP_RTCTL_PMEIE	0x0008	/* PME Interrupt Enable */
-#define  PCI_EXP_RTCTL_CRSVIS	0x0010	/* Configuration Request Retry Status Visible to SW */
-#define PCI_EXP_RTCAP		0x1e	/* Root Capabilities */
-#define  PCI_EXP_RTCAP_CRSVIS	0x0010	/* Configuration Request Retry Status Visible to SW */
-#define PCI_EXP_RTSTA		0x20	/* Root Status */
-#define  PCI_EXP_RTSTA_PME_REQID   0x0000ffff /* PME Requester ID */
-#define  PCI_EXP_RTSTA_PME_STATUS  0x00010000 /* PME Status */
-#define  PCI_EXP_RTSTA_PME_PENDING 0x00020000 /* PME is Pending */
-#define PCI_EXP_DEVCAP2			0x24	/* Device capabilities 2 */
-#define PCI_EXP_DEVCTL2			0x28	/* Device Control */
-#define  PCI_EXP_DEV2_TIMEOUT_RANGE(x)	((x) & 0xf) /* Completion Timeout Ranges Supported */
-#define  PCI_EXP_DEV2_TIMEOUT_VALUE(x)	((x) & 0xf) /* Completion Timeout Value */
-#define  PCI_EXP_DEV2_TIMEOUT_DIS	0x0010	/* Completion Timeout Disable Supported */
-#define  PCI_EXP_DEV2_ARI		0x0020	/* ARI Forwarding */
-#define PCI_EXP_DEVSTA2			0x2a	/* Device Status */
-#define PCI_EXP_LNKCAP2			0x2c	/* Link Capabilities */
-#define PCI_EXP_LNKCTL2			0x30	/* Link Control */
-#define  PCI_EXP_LNKCTL2_SPEED(x)	((x) & 0xf) /* Target Link Speed */
-#define  PCI_EXP_LNKCTL2_CMPLNC		0x0010	/* Enter Compliance */
-#define  PCI_EXP_LNKCTL2_SPEED_DIS	0x0020	/* Hardware Autonomous Speed Disable */
-#define  PCI_EXP_LNKCTL2_DEEMPHASIS(x)	(((x) >> 6) & 1) /* Selectable De-emphasis */
-#define  PCI_EXP_LNKCTL2_MARGIN(x)	(((x) >> 7) & 7) /* Transmit Margin */
-#define  PCI_EXP_LNKCTL2_MOD_CMPLNC	0x0400	/* Enter Modified Compliance */
-#define  PCI_EXP_LNKCTL2_CMPLNC_SOS	0x0800	/* Compliance SOS */
-#define  PCI_EXP_LNKCTL2_COM_DEEMPHASIS(x) (((x) >> 12) & 1) /* Compliance De-emphasis */
-#define PCI_EXP_LNKSTA2			0x32	/* Link Status */
-#define  PCI_EXP_LINKSTA2_DEEMPHASIS(x)	((x) & 1)	/* Current De-emphasis Level */
-#define PCI_EXP_SLTCAP2			0x34	/* Slot Capabilities */
-#define PCI_EXP_SLTCTL2			0x38	/* Slot Control */
-#define PCI_EXP_SLTSTA2			0x3a	/* Slot Status */
-
-/* MSI-X */
-#define  PCI_MSIX_ENABLE	0x8000
-#define  PCI_MSIX_MASK		0x4000
-#define  PCI_MSIX_TABSIZE	0x07ff
-#define PCI_MSIX_TABLE		4
-#define PCI_MSIX_PBA		8
-#define  PCI_MSIX_BIR		0x7
-
-/* Subsystem vendor/device ID for PCI bridges */
-#define PCI_SSVID_VENDOR	4
-#define PCI_SSVID_DEVICE	6
-
-/* PCI Advanced Features */
-#define PCI_AF_CAP		3
-#define  PCI_AF_CAP_TP		0x01
-#define  PCI_AF_CAP_FLR		0x02
-#define PCI_AF_CTRL		4
-#define  PCI_AF_CTRL_FLR	0x01
-#define PCI_AF_STATUS		5
-#define  PCI_AF_STATUS_TP	0x01
-
-/* SATA Host Bus Adapter */
-#define PCI_SATA_HBA_BARS	4
-#define PCI_SATA_HBA_REG0	8
-
-/*** Definitions of extended capabilities ***/
-
-/* Advanced Error Reporting */
-#define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
-#define  PCI_ERR_UNC_TRAIN	0x00000001	/* Undefined in PCIe rev1.1 & 2.0 spec */
-#define  PCI_ERR_UNC_DLP	0x00000010	/* Data Link Protocol */
-#define  PCI_ERR_UNC_SDES	0x00000020	/* Surprise Down Error */
-#define  PCI_ERR_UNC_POISON_TLP	0x00001000	/* Poisoned TLP */
-#define  PCI_ERR_UNC_FCP	0x00002000	/* Flow Control Protocol */
-#define  PCI_ERR_UNC_COMP_TIME	0x00004000	/* Completion Timeout */
-#define  PCI_ERR_UNC_COMP_ABORT	0x00008000	/* Completer Abort */
-#define  PCI_ERR_UNC_UNX_COMP	0x00010000	/* Unexpected Completion */
-#define  PCI_ERR_UNC_RX_OVER	0x00020000	/* Receiver Overflow */
-#define  PCI_ERR_UNC_MALF_TLP	0x00040000	/* Malformed TLP */
-#define  PCI_ERR_UNC_ECRC	0x00080000	/* ECRC Error Status */
-#define  PCI_ERR_UNC_UNSUP	0x00100000	/* Unsupported Request */
-#define  PCI_ERR_UNC_ACS_VIOL	0x00200000	/* ACS Violation */
-#define PCI_ERR_UNCOR_MASK	8	/* Uncorrectable Error Mask */
-	/* Same bits as above */
-#define PCI_ERR_UNCOR_SEVER	12	/* Uncorrectable Error Severity */
-	/* Same bits as above */
-#define PCI_ERR_COR_STATUS	16	/* Correctable Error Status */
-#define  PCI_ERR_COR_RCVR	0x00000001	/* Receiver Error Status */
-#define  PCI_ERR_COR_BAD_TLP	0x00000040	/* Bad TLP Status */
-#define  PCI_ERR_COR_BAD_DLLP	0x00000080	/* Bad DLLP Status */
-#define  PCI_ERR_COR_REP_ROLL	0x00000100	/* REPLAY_NUM Rollover */
-#define  PCI_ERR_COR_REP_TIMER	0x00001000	/* Replay Timer Timeout */
-#define  PCI_ERR_COR_REP_ANFE	0x00002000	/* Advisory Non-Fatal Error */
-#define PCI_ERR_COR_MASK	20	/* Correctable Error Mask */
-	/* Same bits as above */
-#define PCI_ERR_CAP		24	/* Advanced Error Capabilities */
-#define  PCI_ERR_CAP_FEP(x)	((x) & 31)	/* First Error Pointer */
-#define  PCI_ERR_CAP_ECRC_GENC	0x00000020	/* ECRC Generation Capable */
-#define  PCI_ERR_CAP_ECRC_GENE	0x00000040	/* ECRC Generation Enable */
-#define  PCI_ERR_CAP_ECRC_CHKC	0x00000080	/* ECRC Check Capable */
-#define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
-#define PCI_ERR_HEADER_LOG	28	/* Header Log Register (16 bytes) */
-#define PCI_ERR_ROOT_COMMAND	44	/* Root Error Command */
-#define PCI_ERR_ROOT_STATUS	48
-#define PCI_ERR_ROOT_COR_SRC	52
-#define PCI_ERR_ROOT_SRC	54
-
-/* Virtual Channel */
-#define PCI_VC_PORT_REG1	4
-#define PCI_VC_PORT_REG2	8
-#define PCI_VC_PORT_CTRL	12
-#define PCI_VC_PORT_STATUS	14
-#define PCI_VC_RES_CAP		16
-#define PCI_VC_RES_CTRL		20
-#define PCI_VC_RES_STATUS	26
-
-/* Power Budgeting */
-#define PCI_PWR_DSR		4	/* Data Select Register */
-#define PCI_PWR_DATA		8	/* Data Register */
-#define  PCI_PWR_DATA_BASE(x)	((x) & 0xff)	    /* Base Power */
-#define  PCI_PWR_DATA_SCALE(x)	(((x) >> 8) & 3)    /* Data Scale */
-#define  PCI_PWR_DATA_PM_SUB(x)	(((x) >> 10) & 7)   /* PM Sub State */
-#define  PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
-#define  PCI_PWR_DATA_TYPE(x)	(((x) >> 15) & 7)   /* Type */
-#define  PCI_PWR_DATA_RAIL(x)	(((x) >> 18) & 7)   /* Power Rail */
-#define PCI_PWR_CAP		12	/* Capability */
-#define  PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
-
-/* Root Complex Link */
-#define PCI_RCLINK_ESD		4	/* Element Self Description */
-#define PCI_RCLINK_LINK1	16	/* First Link Entry */
-#define  PCI_RCLINK_LINK_DESC	0	/* Link Entry: Description */
-#define  PCI_RCLINK_LINK_ADDR	8	/* Link Entry: Address (64-bit) */
-#define  PCI_RCLINK_LINK_SIZE	16	/* Link Entry: sizeof */
-
-/* PCIe Vendor-Specific Capability */
-#define PCI_EVNDR_HEADER	4	/* Vendor-Specific Header */
-#define PCI_EVNDR_REGISTERS	8	/* Vendor-Specific Registers */
-
-/* Access Control Services */
-#define PCI_ACS_CAP		0x04	/* ACS Capability Register */
-#define PCI_ACS_CAP_VALID	0x0001	/* ACS Source Validation */
-#define PCI_ACS_CAP_BLOCK	0x0002	/* ACS Translation Blocking */
-#define PCI_ACS_CAP_REQ_RED	0x0004	/* ACS P2P Request Redirect */
-#define PCI_ACS_CAP_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect */
-#define PCI_ACS_CAP_FORWARD	0x0010	/* ACS Upstream Forwarding */
-#define PCI_ACS_CAP_EGRESS	0x0020	/* ACS P2P Egress Control */
-#define PCI_ACS_CAP_TRANS	0x0040	/* ACS Direct Translated P2P */
-#define PCI_ACS_CAP_VECTOR(x)	(((x) >> 8) & 0xff) /* Egress Control Vector Size */
-#define PCI_ACS_CTRL		0x06	/* ACS Control Register */
-#define PCI_ACS_CTRL_VALID	0x0001	/* ACS Source Validation Enable */
-#define PCI_ACS_CTRL_BLOCK	0x0002	/* ACS Translation Blocking Enable */
-#define PCI_ACS_CTRL_REQ_RED	0x0004	/* ACS P2P Request Redirect Enable */
-#define PCI_ACS_CTRL_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect Enable */
-#define PCI_ACS_CTRL_FORWARD	0x0010	/* ACS Upstream Forwarding Enable */
-#define PCI_ACS_CTRL_EGRESS	0x0020	/* ACS P2P Egress Control Enable */
-#define PCI_ACS_CTRL_TRANS	0x0040	/* ACS Direct Translated P2P Enable */
-#define PCI_ACS_EGRESS_CTRL	0x08	/* Egress Control Vector */
-
-/* Alternative Routing-ID Interpretation */
-#define PCI_ARI_CAP		0x04	/* ARI Capability Register */
-#define  PCI_ARI_CAP_MFVC	0x0001	/* MFVC Function Groups Capability */
-#define  PCI_ARI_CAP_ACS	0x0002	/* ACS Function Groups Capability */
-#define  PCI_ARI_CAP_NFN(x)	(((x) >> 8) & 0xff) /* Next Function Number */
-#define PCI_ARI_CTRL		0x06	/* ARI Control Register */
-#define  PCI_ARI_CTRL_MFVC	0x0001	/* MFVC Function Groups Enable */
-#define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
-#define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
-
-/* Address Translation Service */
-#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
-#define  PCI_ATS_CAP_IQD(x)	((x) & 0x1f) /* Invalidate Queue Depth */
-#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
-#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f) /* Smallest Translation Unit */
-#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
-
-/* Single Root I/O Virtualization */
-#define PCI_IOV_CAP		0x04	/* SR-IOV Capability Register */
-#define  PCI_IOV_CAP_VFM	0x00000001 /* VF Migration Capable */
-#define  PCI_IOV_CAP_IMN(x)	((x) >> 21) /* VF Migration Interrupt Message Number */
-#define PCI_IOV_CTRL		0x08	/* SR-IOV Control Register */
-#define  PCI_IOV_CTRL_VFE	0x0001	/* VF Enable */
-#define  PCI_IOV_CTRL_VFME	0x0002	/* VF Migration Enable */
-#define  PCI_IOV_CTRL_VFMIE	0x0004	/* VF Migration Interrupt Enable */
-#define  PCI_IOV_CTRL_MSE	0x0008	/* VF MSE */
-#define  PCI_IOV_CTRL_ARI	0x0010	/* ARI Capable Hierarchy */
-#define PCI_IOV_STATUS		0x0a	/* SR-IOV Status Register */
-#define  PCI_IOV_STATUS_MS	0x0001	/* VF Migration Status */
-#define PCI_IOV_INITIALVF	0x0c	/* Number of VFs that are initially associated */
-#define PCI_IOV_TOTALVF		0x0e	/* Maximum number of VFs that could be associated */
-#define PCI_IOV_NUMVF		0x10	/* Number of VFs that are available */
-#define PCI_IOV_FDL		0x12	/* Function Dependency Link */
-#define PCI_IOV_OFFSET		0x14	/* First VF Offset */
-#define PCI_IOV_STRIDE		0x16	/* Routing ID offset from one VF to the next one */
-#define PCI_IOV_DID		0x1a	/* VF Device ID */
-#define PCI_IOV_SUPPS		0x1c	/* Supported Page Sizes */
-#define PCI_IOV_SYSPS		0x20	/* System Page Size */
-#define PCI_IOV_BAR_BASE	0x24	/* VF BAR0, VF BAR1, ... VF BAR5 */
-#define PCI_IOV_NUM_BAR		6	/* Number of VF BARs */
-#define PCI_IOV_MSAO		0x3c	/* VF Migration State Array Offset */
-#define PCI_IOV_MSA_BIR(x)	((x) & 7) /* VF Migration State BIR */
-#define PCI_IOV_MSA_OFFSET(x)	((x) & 0xfffffff8) /* VF Migration State Offset */
-
-/* Transaction Processing Hints */
-#define PCI_TPH_CAPABILITIES	4
-#define   PCI_TPH_INTVEC_SUP	(1<<1)	/* Supports interrupt vector mode */
-#define   PCI_TPH_DEV_SUP      	(1<<2)	/* Device specific mode supported */
-#define   PCI_TPH_EXT_REQ_SUP	(1<<8)	/* Supports extended requests */
-#define   PCI_TPH_ST_LOC_MASK	(3<<9)	/* Steering table location bits */
-#define     PCI_TPH_ST_NONE	(0<<9)	/* No steering table */
-#define     PCI_TPH_ST_CAP	(1<<9)	/* Steering table in TPH cap */
-#define     PCI_TPH_ST_MSIX	(2<<9)	/* Steering table in MSI-X table */
-#define   PCI_TPH_ST_SIZE_SHIFT	(16)	/* Encoded as size - 1 */
-
-/* Latency Tolerance Reporting */
-#define PCI_LTR_MAX_SNOOP	4	/* 16 bit value */
-#define   PCI_LTR_VALUE_MASK	(0x3ff)
-#define   PCI_LTR_SCALE_SHIFT	(10)
-#define   PCI_LTR_SCALE_MASK	(7)
-#define PCI_LTR_MAX_NOSNOOP	6	/* 16 bit value */
-
-/*
- * The PCI interface treats multi-function devices as independent
- * devices.  The slot/function address of each device is encoded
- * in a single byte as follows:
- *
- *	7:3 = slot
- *	2:0 = function
- */
-#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
-#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
-#define PCI_FUNC(devfn)		((devfn) & 0x07)
-
-/* Device classes and subclasses */
-
-#define PCI_CLASS_NOT_DEFINED		0x0000
-#define PCI_CLASS_NOT_DEFINED_VGA	0x0001
-
-#define PCI_BASE_CLASS_STORAGE		0x01
-#define PCI_CLASS_STORAGE_SCSI		0x0100
-#define PCI_CLASS_STORAGE_IDE		0x0101
-#define PCI_CLASS_STORAGE_FLOPPY	0x0102
-#define PCI_CLASS_STORAGE_IPI		0x0103
-#define PCI_CLASS_STORAGE_RAID		0x0104
-#define PCI_CLASS_STORAGE_ATA		0x0105
-#define PCI_CLASS_STORAGE_SATA		0x0106
-#define PCI_CLASS_STORAGE_SAS		0x0107
-#define PCI_CLASS_STORAGE_OTHER		0x0180
-
-#define PCI_BASE_CLASS_NETWORK		0x02
-#define PCI_CLASS_NETWORK_ETHERNET	0x0200
-#define PCI_CLASS_NETWORK_TOKEN_RING	0x0201
-#define PCI_CLASS_NETWORK_FDDI		0x0202
-#define PCI_CLASS_NETWORK_ATM		0x0203
-#define PCI_CLASS_NETWORK_ISDN		0x0204
-#define PCI_CLASS_NETWORK_OTHER		0x0280
-
-#define PCI_BASE_CLASS_DISPLAY		0x03
-#define PCI_CLASS_DISPLAY_VGA		0x0300
-#define PCI_CLASS_DISPLAY_XGA		0x0301
-#define PCI_CLASS_DISPLAY_3D		0x0302
-#define PCI_CLASS_DISPLAY_OTHER		0x0380
-
-#define PCI_BASE_CLASS_MULTIMEDIA	0x04
-#define PCI_CLASS_MULTIMEDIA_VIDEO	0x0400
-#define PCI_CLASS_MULTIMEDIA_AUDIO	0x0401
-#define PCI_CLASS_MULTIMEDIA_PHONE	0x0402
-#define PCI_CLASS_MULTIMEDIA_AUDIO_DEV	0x0403
-#define PCI_CLASS_MULTIMEDIA_OTHER	0x0480
-
-#define PCI_BASE_CLASS_MEMORY		0x05
-#define  PCI_CLASS_MEMORY_RAM		0x0500
-#define  PCI_CLASS_MEMORY_FLASH		0x0501
-#define  PCI_CLASS_MEMORY_OTHER		0x0580
-
-#define PCI_BASE_CLASS_BRIDGE		0x06
-#define  PCI_CLASS_BRIDGE_HOST		0x0600
-#define  PCI_CLASS_BRIDGE_ISA		0x0601
-#define  PCI_CLASS_BRIDGE_EISA		0x0602
-#define  PCI_CLASS_BRIDGE_MC		0x0603
-#define  PCI_CLASS_BRIDGE_PCI		0x0604
-#define  PCI_CLASS_BRIDGE_PCMCIA	0x0605
-#define  PCI_CLASS_BRIDGE_NUBUS		0x0606
-#define  PCI_CLASS_BRIDGE_CARDBUS	0x0607
-#define  PCI_CLASS_BRIDGE_RACEWAY	0x0608
-#define  PCI_CLASS_BRIDGE_PCI_SEMI	0x0609
-#define  PCI_CLASS_BRIDGE_IB_TO_PCI	0x060a
-#define  PCI_CLASS_BRIDGE_OTHER		0x0680
-
-#define PCI_BASE_CLASS_COMMUNICATION	0x07
-#define PCI_CLASS_COMMUNICATION_SERIAL	0x0700
-#define PCI_CLASS_COMMUNICATION_PARALLEL 0x0701
-#define PCI_CLASS_COMMUNICATION_MSERIAL	0x0702
-#define PCI_CLASS_COMMUNICATION_MODEM	0x0703
-#define PCI_CLASS_COMMUNICATION_OTHER	0x0780
-
-#define PCI_BASE_CLASS_SYSTEM		0x08
-#define PCI_CLASS_SYSTEM_PIC		0x0800
-#define PCI_CLASS_SYSTEM_DMA		0x0801
-#define PCI_CLASS_SYSTEM_TIMER		0x0802
-#define PCI_CLASS_SYSTEM_RTC		0x0803
-#define PCI_CLASS_SYSTEM_PCI_HOTPLUG	0x0804
-#define PCI_CLASS_SYSTEM_OTHER		0x0880
-
-#define PCI_BASE_CLASS_INPUT		0x09
-#define PCI_CLASS_INPUT_KEYBOARD	0x0900
-#define PCI_CLASS_INPUT_PEN		0x0901
-#define PCI_CLASS_INPUT_MOUSE		0x0902
-#define PCI_CLASS_INPUT_SCANNER		0x0903
-#define PCI_CLASS_INPUT_GAMEPORT	0x0904
-#define PCI_CLASS_INPUT_OTHER		0x0980
-
-#define PCI_BASE_CLASS_DOCKING		0x0a
-#define PCI_CLASS_DOCKING_GENERIC	0x0a00
-#define PCI_CLASS_DOCKING_OTHER		0x0a80
-
-#define PCI_BASE_CLASS_PROCESSOR	0x0b
-#define PCI_CLASS_PROCESSOR_386		0x0b00
-#define PCI_CLASS_PROCESSOR_486		0x0b01
-#define PCI_CLASS_PROCESSOR_PENTIUM	0x0b02
-#define PCI_CLASS_PROCESSOR_ALPHA	0x0b10
-#define PCI_CLASS_PROCESSOR_POWERPC	0x0b20
-#define PCI_CLASS_PROCESSOR_MIPS	0x0b30
-#define PCI_CLASS_PROCESSOR_CO		0x0b40
-
-#define PCI_BASE_CLASS_SERIAL		0x0c
-#define PCI_CLASS_SERIAL_FIREWIRE	0x0c00
-#define PCI_CLASS_SERIAL_ACCESS		0x0c01
-#define PCI_CLASS_SERIAL_SSA		0x0c02
-#define PCI_CLASS_SERIAL_USB		0x0c03
-#define PCI_CLASS_SERIAL_FIBER		0x0c04
-#define PCI_CLASS_SERIAL_SMBUS		0x0c05
-#define PCI_CLASS_SERIAL_INFINIBAND	0x0c06
-
-#define PCI_BASE_CLASS_WIRELESS		0x0d
-#define PCI_CLASS_WIRELESS_IRDA		0x0d00
-#define PCI_CLASS_WIRELESS_CONSUMER_IR	0x0d01
-#define PCI_CLASS_WIRELESS_RF		0x0d10
-#define PCI_CLASS_WIRELESS_OTHER	0x0d80
-
-#define PCI_BASE_CLASS_INTELLIGENT	0x0e
-#define PCI_CLASS_INTELLIGENT_I2O	0x0e00
-
-#define PCI_BASE_CLASS_SATELLITE	0x0f
-#define PCI_CLASS_SATELLITE_TV		0x0f00
-#define PCI_CLASS_SATELLITE_AUDIO	0x0f01
-#define PCI_CLASS_SATELLITE_VOICE	0x0f03
-#define PCI_CLASS_SATELLITE_DATA	0x0f04
-
-#define PCI_BASE_CLASS_CRYPT		0x10
-#define PCI_CLASS_CRYPT_NETWORK		0x1000
-#define PCI_CLASS_CRYPT_ENTERTAINMENT	0x1010
-#define PCI_CLASS_CRYPT_OTHER		0x1080
-
-#define PCI_BASE_CLASS_SIGNAL		0x11
-#define PCI_CLASS_SIGNAL_DPIO		0x1100
-#define PCI_CLASS_SIGNAL_PERF_CTR	0x1101
-#define PCI_CLASS_SIGNAL_SYNCHRONIZER	0x1110
-#define PCI_CLASS_SIGNAL_OTHER		0x1180
-
-#define PCI_CLASS_OTHERS		0xff
-
-/* Several ID's we need in the library */
-
-#define PCI_VENDOR_ID_INTEL		0x8086
-#define PCI_VENDOR_ID_COMPAQ		0x0e11
diff --git a/ext/hwloc/include/pci/pci.h b/ext/hwloc/include/pci/pci.h
deleted file mode 100644
index 7a5a6b80c..000000000
--- a/ext/hwloc/include/pci/pci.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- *	The PCI Library
- *
- *	Copyright (c) 1997--2009 Martin Mares <mj@ucw.cz>
- *
- *	Can be freely distributed and used under the terms of the GNU GPL.
- */
-
-#ifndef _PCI_LIB_H
-#define _PCI_LIB_H
-
-#ifndef PCI_CONFIG_H
-#include "config.h"
-#endif
-
-#include "header.h"
-#include "types.h"
-
-#define PCI_LIB_VERSION 0x030100
-
-#ifndef PCI_ABI
-#define PCI_ABI
-#endif
-
-/*
- *	PCI Access Structure
- */
-
-struct pci_methods;
-
-enum pci_access_type {
-  /* Known access methods, remember to update access.c as well */
-  PCI_ACCESS_AUTO,			/* Autodetection */
-  PCI_ACCESS_SYS_BUS_PCI,		/* Linux /sys/bus/pci */
-  PCI_ACCESS_PROC_BUS_PCI,		/* Linux /proc/bus/pci */
-  PCI_ACCESS_I386_TYPE1,		/* i386 ports, type 1 */
-  PCI_ACCESS_I386_TYPE2,		/* i386 ports, type 2 */
-  PCI_ACCESS_FBSD_DEVICE,		/* FreeBSD /dev/pci */
-  PCI_ACCESS_AIX_DEVICE,		/* /dev/pci0, /dev/bus0, etc. */
-  PCI_ACCESS_NBSD_LIBPCI,		/* NetBSD libpci */
-  PCI_ACCESS_OBSD_DEVICE,		/* OpenBSD /dev/pci */
-  PCI_ACCESS_DUMP,			/* Dump file */
-  PCI_ACCESS_MAX
-};
-
-struct pci_access {
-  /* Options you can change: */
-  unsigned int method;			/* Access method */
-  int writeable;			/* Open in read/write mode */
-  int buscentric;			/* Bus-centric view of the world */
-
-  char *id_file_name;			/* Name of ID list file (use pci_set_name_list_path()) */
-  int free_id_name;			/* Set if id_file_name is malloced */
-  int numeric_ids;			/* Enforce PCI_LOOKUP_NUMERIC (>1 => PCI_LOOKUP_MIXED) */
-
-  unsigned int id_lookup_mode;		/* pci_lookup_mode flags which are set automatically */
-					/* Default: PCI_LOOKUP_CACHE */
-
-  int debugging;			/* Turn on debugging messages */
-
-  /* Functions you can override: */
-  void (*error)(char *msg, ...) PCI_PRINTF(1,2);	/* Write error message and quit */
-  void (*warning)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a warning message */
-  void (*debug)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a debugging message */
-
-  struct pci_dev *devices;		/* Devices found on this bus */
-
-  /* Fields used internally: */
-  struct pci_methods *methods;
-  struct pci_param *params;
-  struct id_entry **id_hash;		/* names.c */
-  struct id_bucket *current_id_bucket;
-  int id_load_failed;
-  int id_cache_status;			/* 0=not read, 1=read, 2=dirty */
-  int fd;				/* proc/sys: fd for config space */
-  int fd_rw;				/* proc/sys: fd opened read-write */
-  int fd_pos;				/* proc/sys: current position */
-  int fd_vpd;				/* sys: fd for VPD */
-  struct pci_dev *cached_dev;		/* proc/sys: device the fds are for */
-};
-
-/* Initialize PCI access */
-struct pci_access *pci_alloc(void) PCI_ABI;
-void pci_init(struct pci_access *) PCI_ABI;
-void pci_cleanup(struct pci_access *) PCI_ABI;
-
-/* Scanning of devices */
-void pci_scan_bus(struct pci_access *acc) PCI_ABI;
-struct pci_dev *pci_get_dev(struct pci_access *acc, int domain, int bus, int dev, int func) PCI_ABI; /* Raw access to specified device */
-void pci_free_dev(struct pci_dev *) PCI_ABI;
-
-/* Names of access methods */
-int pci_lookup_method(char *name) PCI_ABI;	/* Returns -1 if not found */
-char *pci_get_method_name(int index) PCI_ABI;	/* Returns "" if unavailable, NULL if index out of range */
-
-/*
- *	Named parameters
- */
-
-struct pci_param {
-  struct pci_param *next;		/* Please use pci_walk_params() for traversing the list */
-  char *param;				/* Name of the parameter */
-  char *value;				/* Value of the parameter */
-  int value_malloced;			/* used internally */
-  char *help;				/* Explanation of the parameter */
-};
-
-char *pci_get_param(struct pci_access *acc, char *param) PCI_ABI;
-int pci_set_param(struct pci_access *acc, char *param, char *value) PCI_ABI;	/* 0 on success, -1 if no such parameter */
-/* To traverse the list, call pci_walk_params repeatedly, first with prev=NULL, and do not modify the parameters during traversal. */
-struct pci_param *pci_walk_params(struct pci_access *acc, struct pci_param *prev) PCI_ABI;
-
-/*
- *	Devices
- */
-
-struct pci_dev {
-  struct pci_dev *next;			/* Next device in the chain */
-  u16 domain;				/* PCI domain (host bridge) */
-  u8 bus, dev, func;			/* Bus inside domain, device and function */
-
-  /* These fields are set by pci_fill_info() */
-  int known_fields;			/* Set of info fields already known */
-  u16 vendor_id, device_id;		/* Identity of the device */
-  u16 device_class;			/* PCI device class */
-  int irq;				/* IRQ number */
-  pciaddr_t base_addr[6];		/* Base addresses including flags in lower bits */
-  pciaddr_t size[6];			/* Region sizes */
-  pciaddr_t rom_base_addr;		/* Expansion ROM base address */
-  pciaddr_t rom_size;			/* Expansion ROM size */
-  struct pci_cap *first_cap;		/* List of capabilities */
-  char *phy_slot;			/* Physical slot */
-
-  /* Fields used internally: */
-  struct pci_access *access;
-  struct pci_methods *methods;
-  u8 *cache;				/* Cached config registers */
-  int cache_len;
-  int hdrtype;				/* Cached low 7 bits of header type, -1 if unknown */
-  void *aux;				/* Auxillary data */
-};
-
-#define PCI_ADDR_IO_MASK (~(pciaddr_t) 0x3)
-#define PCI_ADDR_MEM_MASK (~(pciaddr_t) 0xf)
-#define PCI_ADDR_FLAG_MASK 0xf
-
-u8 pci_read_byte(struct pci_dev *, int pos) PCI_ABI; /* Access to configuration space */
-u16 pci_read_word(struct pci_dev *, int pos) PCI_ABI;
-u32 pci_read_long(struct pci_dev *, int pos) PCI_ABI;
-int pci_read_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
-int pci_read_vpd(struct pci_dev *d, int pos, u8 *buf, int len) PCI_ABI;
-int pci_write_byte(struct pci_dev *, int pos, u8 data) PCI_ABI;
-int pci_write_word(struct pci_dev *, int pos, u16 data) PCI_ABI;
-int pci_write_long(struct pci_dev *, int pos, u32 data) PCI_ABI;
-int pci_write_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
-
-int pci_fill_info(struct pci_dev *, int flags) PCI_ABI; /* Fill in device information */
-
-#define PCI_FILL_IDENT		1
-#define PCI_FILL_IRQ		2
-#define PCI_FILL_BASES		4
-#define PCI_FILL_ROM_BASE	8
-#define PCI_FILL_SIZES		16
-#define PCI_FILL_CLASS		32
-#define PCI_FILL_CAPS		64
-#define PCI_FILL_EXT_CAPS	128
-#define PCI_FILL_PHYS_SLOT	256
-#define PCI_FILL_RESCAN		0x10000
-
-void pci_setup_cache(struct pci_dev *, u8 *cache, int len) PCI_ABI;
-
-/*
- *	Capabilities
- */
-
-struct pci_cap {
-  struct pci_cap *next;
-  u16 id;				/* PCI_CAP_ID_xxx */
-  u16 type;				/* PCI_CAP_xxx */
-  unsigned int addr;			/* Position in the config space */
-};
-
-#define PCI_CAP_NORMAL		1	/* Traditional PCI capabilities */
-#define PCI_CAP_EXTENDED	2	/* PCIe extended capabilities */
-
-struct pci_cap *pci_find_cap(struct pci_dev *, unsigned int id, unsigned int type) PCI_ABI;
-
-/*
- *	Filters
- */
-
-struct pci_filter {
-  int domain, bus, slot, func;			/* -1 = ANY */
-  int vendor, device;
-};
-
-void pci_filter_init(struct pci_access *, struct pci_filter *) PCI_ABI;
-char *pci_filter_parse_slot(struct pci_filter *, char *) PCI_ABI;
-char *pci_filter_parse_id(struct pci_filter *, char *) PCI_ABI;
-int pci_filter_match(struct pci_filter *, struct pci_dev *) PCI_ABI;
-
-/*
- *	Conversion of PCI ID's to names (according to the pci.ids file)
- *
- *	Call pci_lookup_name() to identify different types of ID's:
- *
- *	VENDOR				(vendorID) -> vendor
- *	DEVICE				(vendorID, deviceID) -> device
- *	VENDOR | DEVICE			(vendorID, deviceID) -> combined vendor and device
- *	SUBSYSTEM | VENDOR		(subvendorID) -> subsystem vendor
- *	SUBSYSTEM | DEVICE		(vendorID, deviceID, subvendorID, subdevID) -> subsystem device
- *	SUBSYSTEM | VENDOR | DEVICE	(vendorID, deviceID, subvendorID, subdevID) -> combined subsystem v+d
- *	SUBSYSTEM | ...			(-1, -1, subvendorID, subdevID) -> generic subsystem
- *	CLASS				(classID) -> class
- *	PROGIF				(classID, progif) -> programming interface
- */
-
-char *pci_lookup_name(struct pci_access *a, char *buf, int size, int flags, ...) PCI_ABI;
-
-int pci_load_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_lookup_*() when needed; returns success */
-void pci_free_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_cleanup() */
-void pci_set_name_list_path(struct pci_access *a, char *name, int to_be_freed) PCI_ABI;
-void pci_id_cache_flush(struct pci_access *a) PCI_ABI;
-
-enum pci_lookup_mode {
-  PCI_LOOKUP_VENDOR = 1,		/* Vendor name (args: vendorID) */
-  PCI_LOOKUP_DEVICE = 2,		/* Device name (args: vendorID, deviceID) */
-  PCI_LOOKUP_CLASS = 4,			/* Device class (args: classID) */
-  PCI_LOOKUP_SUBSYSTEM = 8,
-  PCI_LOOKUP_PROGIF = 16,		/* Programming interface (args: classID, prog_if) */
-  PCI_LOOKUP_NUMERIC = 0x10000,		/* Want only formatted numbers; default if access->numeric_ids is set */
-  PCI_LOOKUP_NO_NUMBERS = 0x20000,	/* Return NULL if not found in the database; default is to print numerically */
-  PCI_LOOKUP_MIXED = 0x40000,		/* Include both numbers and names */
-  PCI_LOOKUP_NETWORK = 0x80000,		/* Try to resolve unknown ID's by DNS */
-  PCI_LOOKUP_SKIP_LOCAL = 0x100000,	/* Do not consult local database */
-  PCI_LOOKUP_CACHE = 0x200000,		/* Consult the local cache before using DNS */
-  PCI_LOOKUP_REFRESH_CACHE = 0x400000,	/* Forget all previously cached entries, but still allow updating the cache */
-};
-
-#endif
diff --git a/ext/hwloc/include/pci/types.h b/ext/hwloc/include/pci/types.h
deleted file mode 100644
index 4d23e692b..000000000
--- a/ext/hwloc/include/pci/types.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *	The PCI Library -- Types and Format Strings
- *
- *	Copyright (c) 1997--2008 Martin Mares <mj@ucw.cz>
- *
- *	Can be freely distributed and used under the terms of the GNU GPL.
- */
-
-#include <sys/types.h>
-
-#ifndef PCI_HAVE_Uxx_TYPES
-
-#ifdef PCI_OS_WINDOWS
-#include <windef.h>
-typedef BYTE u8;
-typedef WORD u16;
-typedef DWORD u32;
-#elif defined(PCI_HAVE_STDINT_H) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
-#include <stdint.h>
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-#else
-typedef u_int8_t u8;
-typedef u_int16_t u16;
-typedef u_int32_t u32;
-#endif
-
-#ifdef PCI_HAVE_64BIT_ADDRESS
-#include <limits.h>
-#if ULONG_MAX > 0xffffffff
-typedef unsigned long u64;
-#define PCI_U64_FMT "l"
-#else
-typedef unsigned long long u64;
-#define PCI_U64_FMT "ll"
-#endif
-#endif
-
-#endif	/* PCI_HAVE_Uxx_TYPES */
-
-#ifdef PCI_HAVE_64BIT_ADDRESS
-typedef u64 pciaddr_t;
-#define PCIADDR_T_FMT "%08" PCI_U64_FMT "x"
-#define PCIADDR_PORT_FMT "%04" PCI_U64_FMT "x"
-#else
-typedef u32 pciaddr_t;
-#define PCIADDR_T_FMT "%08x"
-#define PCIADDR_PORT_FMT "%04x"
-#endif
-
-#ifdef PCI_ARCH_SPARC64
-/* On sparc64 Linux the kernel reports remapped port addresses and IRQ numbers */
-#undef PCIADDR_PORT_FMT
-#define PCIADDR_PORT_FMT PCIADDR_T_FMT
-#define PCIIRQ_FMT "%08x"
-#else
-#define PCIIRQ_FMT "%d"
-#endif
-
-#if defined(__GNUC__) && __GNUC__ > 2
-#define PCI_PRINTF(x,y) __attribute__((format(printf, x, y)))
-#else
-#define PCI_PRINTF(x,y)
-#endif
diff --git a/ext/hwloc/include/private/autogen/config.h b/ext/hwloc/include/private/autogen/config.h
index 0cc9e51db..00e6d57d7 100644
--- a/ext/hwloc/include/private/autogen/config.h
+++ b/ext/hwloc/include/private/autogen/config.h
@@ -30,7 +30,8 @@
 
 /* Define to 1 if you have the `close' function. */
 #define HAVE_CLOSE 1
-
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
 /* Define to 1 if you have the `clz' function. */
 /* #undef HAVE_CLZ */
 
@@ -86,6 +87,14 @@
    don't. */
 #define HAVE_DECL_HW_NCPU 0
 
+/* Define to 1 if you have the declaration of `lgrp_latency_cookie', and to 0
+   if you don't. */
+/* #undef HAVE_DECL_LGRP_LATENCY_COOKIE */
+
+/* Define to 1 if you have the declaration of `modff', and to 0 if you don't.
+   */
+#define HAVE_DECL_MODFF 1
+
 /* Define to 1 if you have the declaration of
    `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */
 /* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */
@@ -98,10 +107,25 @@
    0 if you don't. */
 #define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1
 
+/* Embedded mode; just assume we do not have Valgrind support */
+#define HAVE_DECL_RUNNING_ON_VALGRIND 0
+
+/* Define to 1 if you have the declaration of `sched_getcpu', and to 0 if you
+   don't. */
+#define HAVE_DECL_SCHED_GETCPU 1
+
+/* Define to 1 if you have the declaration of `snprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_SNPRINTF 1
+
 /* Define to 1 if you have the declaration of `strtoull', and to 0 if you
    don't. */
 #define HAVE_DECL_STRTOULL 1
 
+/* Define to 1 if you have the declaration of `_putenv', and to 0 if you
+   don't. */
+#define HAVE_DECL__PUTENV 0
+
 /* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0
    if you don't. */
 #define HAVE_DECL__SC_LARGE_PAGESIZE 0
@@ -130,6 +154,10 @@
    don't. */
 #define HAVE_DECL__SC_PAGE_SIZE 1
 
+/* Define to 1 if you have the declaration of `_strdup', and to 0 if you
+   don't. */
+#define HAVE_DECL__STRDUP 0
+
 /* Define to 1 if you have the <dirent.h> header file. */
 #define HAVE_DIRENT_H 1
 
@@ -238,7 +266,11 @@
 
 /* Define to 1 if you have the <myriexpress.h> header file. */
 /* #undef HAVE_MYRIEXPRESS_H */
+/* Define to 1 if you have the `mkstemp' function. */
+#define HAVE_MKSTEMP 1
 
+/* Define to 1 if you have the <mpi.h> header file. */
+/* #undef HAVE_MPI_H */
 /* Define to 1 if you have the `nl_langinfo' function. */
 #define HAVE_NL_LANGINFO 1
 
@@ -260,6 +292,9 @@
 /* Define to 1 if you have the `openat' function. */
 #define HAVE_OPENAT 1
 
+/* Define to 1 if you have the <OpenCL/cl_ext.h> header file. */
+/* #undef HAVE_OPENCL_CL_EXT_H */
+
 /* Define to 1 if you have the <picl.h> header file. */
 /* #undef HAVE_PICL_H */
 
@@ -272,6 +307,9 @@
 /* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */
 /* #undef HAVE_PROCESSOR_GROUP_INFO */
 
+/* Define to 1 if the system has the type `PROCESSOR_NUMBER'. */
+/* #undef HAVE_PROCESSOR_NUMBER */
+
 /* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */
 /* #undef HAVE_PROCESSOR_RELATIONSHIP */
 
@@ -310,12 +348,18 @@
 /* Define to 1 if you have the `setlocale' function. */
 #define HAVE_SETLOCALE 1
 
+/* Define to 1 if the system has the type `ssize_t'. */
+#define HAVE_SSIZE_T 1
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #define HAVE_STDINT_H 1
 
 /* Define to 1 if you have the <stdlib.h> header file. */
 #define HAVE_STDLIB_H 1
 
+/* Define to 1 if you have the `strcasecmp' function. */
+#define HAVE_STRCASECMP 1
+
 /* Define to 1 if you have the `strftime' function. */
 #define HAVE_STRFTIME 1
 
@@ -334,6 +378,10 @@
 /* Define to 1 if gcc's __sync builtins are available */
 #define HAVE_SYNC_BUILTINS 1
 #endif
+
+/* Define to 1 if you have the `strtoull' function. */
+/* #undef HAVE_STRTOULL */
+
 /* Define to '1' if sysctl is present and usable */
 /* #undef HAVE_SYSCTL */
 
@@ -377,6 +425,9 @@
 /* Define to 1 if you have the <sys/utsname.h> header file. */
 #define HAVE_SYS_UTSNAME_H 1
 
+/* Define to 1 if you have the <time.h> header file. */
+#define HAVE_TIME_H 1
+
 /* Define to 1 if you have the `uname' function. */
 #define HAVE_UNAME 1
 
@@ -390,6 +441,9 @@
 /* Define to 1 if you have the `uselocale' function. */
 #define HAVE_USELOCALE 1
 
+/* Define to 1 if you have the <valgrind/valgrind.h> header file. */
+/* #undef HAVE_VALGRIND_VALGRIND_H */
+
 /* Define to 1 if the system has the type `wchar_t'. */
 #define HAVE_WCHAR_T 1
 
@@ -487,13 +541,16 @@
 /* #undef HWLOC_HAVE_BROKEN_FFS */
 
 /* Define to 1 if you have the `cairo' library. */
-#define HWLOC_HAVE_CAIRO 1
+/* #undef HWLOC_HAVE_CAIRO */
 
 /* Define to 1 if you have the `clz' function. */
 /* #undef HWLOC_HAVE_CLZ */
 
 /* Define to 1 if you have the `clzl' function. */
 /* #undef HWLOC_HAVE_CLZL */
+/* Define to 1 if snprintf supports NULL output buffer and returns the correct
+   length on truncation */
+#undef HWLOC_HAVE_CORRECT_SNPRINTF
 
 /* Define to 1 if the CPU_SET macro works */
 #define HWLOC_HAVE_CPU_SET 1
@@ -522,6 +579,9 @@
 /* Define to 1 if function `flsl' is declared by system headers */
 /* #undef HWLOC_HAVE_DECL_FLSL */
 
+/* Define to 1 if function `strcasecmp' is declared by system headers */
+#define HWLOC_HAVE_DECL_STRCASECMP 1
+
 /* Define to 1 if function `strncasecmp' is declared by system headers */
 #define HWLOC_HAVE_DECL_STRNCASECMP 1
 
@@ -543,12 +603,19 @@
 /* Define to 1 if you have a library providing the termcap interface */
 /* #undef HWLOC_HAVE_LIBTERMCAP */
 
+/* Define to 1 if you have libudev. */
+/* #undef HWLOC_HAVE_LIBUDEV */
+
 /* Define to 1 if you have the `libxml2' library. */
 /* #undef HWLOC_HAVE_LIBXML2 */
 
 /* Define to 1 if building the Linux PCI component */
 #define HWLOC_HAVE_LINUXPCI 1
 
+/* Define to 1 if building the Linux I/O component */
+#define HWLOC_HAVE_LINUXIO 1
+
+
 /* Define to 1 if mbind is available. */
 /* #undef HWLOC_HAVE_MBIND */
 
@@ -584,12 +651,15 @@
 /* Define to 1 if you have the <stdint.h> header file. */
 #define HWLOC_HAVE_STDINT_H 1
 
+/* Define to 1 if function `syscall' is available with 6 parameters */
+#define HWLOC_HAVE_SYSCALL 1
+
 /* Define to 1 if you have the `windows.h' header. */
 /* #undef HWLOC_HAVE_WINDOWS_H */
 
 /* Define to 1 if X11 headers including Xutil.h and keysym.h are available. */
 #define HWLOC_HAVE_X11_KEYSYM 1
-#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
+#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A) && !defined(_ARCH_PPC)
 /* Define to 1 if you have x86 cpuid */
 #define HWLOC_HAVE_X86_CPUID 1
 #endif
@@ -615,7 +685,7 @@
 /* The size of `unsigned int', as computed by sizeof */
 #define HWLOC_SIZEOF_UNSIGNED_INT 4
 /* The size of `unsigned long', as computed by sizeof */
-#ifdef __x86_64
+#if defined(__x86_64) || defined(_ARCH_PPC)
 #define HWLOC_SIZEOF_UNSIGNED_LONG 8
 #else
 #if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A)
@@ -636,14 +706,28 @@
 #define HWLOC_SYM_TRANSFORM 1
 
 /* Define to 1 on unsupported systems */
-/* #undef HWLOC_UNSUPPORTED_SYS */
+#ifdef _ARCH_PPC
+#define HWLOC_UNSUPPORTED_SYS 1
+#endif 
 
 /* Define to 1 if ncurses works, preferred over curses */
 /* #undef HWLOC_USE_NCURSES */
 
 /* The library version, always available, even in embedded mode, contrary to
    VERSION */
-#define HWLOC_VERSION "2.0.0a1-git"
+#define HWLOC_VERSION "2.1.0"
+
+/* The library version optional greek suffix string */
+#define HWLOC_VERSION_GREEK ""
+
+/* The library version major number */
+#define HWLOC_VERSION_MAJOR 2
+
+/* The library version minor number */
+#define HWLOC_VERSION_MINOR 1
+
+/* The library version release number */
+#define HWLOC_VERSION_RELEASE 0
 
 /* Define to 1 on WINDOWS */
 /* #undef HWLOC_WIN_SYS */
@@ -660,21 +744,23 @@
 #endif
 #endif
 
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
 #define LT_OBJDIR ".libs/"
 
+/* Define to 1 if netlocscotch is enabled */
+/* #undef NETLOC_SCOTCH */
+
 /* Name of package */
 #define PACKAGE "hwloc"
 
 /* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT "http://www.open-mpi.org/projects/hwloc/"
+#define PACKAGE_BUGREPORT "http://github.com/open-mpi/hwloc/issues"
 
 /* Define to the full name of this package. */
 #define PACKAGE_NAME "hwloc"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "hwloc 2.0.0a1-git"
+#define PACKAGE_STRING "hwloc 2.1.0"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "hwloc"
@@ -683,7 +769,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.0.0a1-git"
+#define PACKAGE_VERSION "2.1.0"
 
 /* The size of `unsigned int', as computed by sizeof. */
 #define SIZEOF_UNSIGNED_INT 4
@@ -742,10 +828,10 @@
 #endif
 
 /* Version number of package */
-#define VERSION "2.0.0a1-git"
+#define VERSION "2.1.0"
 
 /* Define to 1 if the X Window System is missing or not being used. */
-/* #undef X_DISPLAY_MISSING */
+#define X_DISPLAY_MISSING 1
 
 /* Are we building for HP-UX? */
 #define _HPUX_SOURCE 1
diff --git a/ext/hwloc/include/private/components.h b/ext/hwloc/include/private/components.h
index b36634535..e28c00b1d 100644
--- a/ext/hwloc/include/private/components.h
+++ b/ext/hwloc/include/private/components.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012 Inria.  All rights reserved.
+ * Copyright © 2012-2019 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
@@ -16,25 +16,30 @@
 #ifndef PRIVATE_COMPONENTS_H
 #define PRIVATE_COMPONENTS_H 1
 
-#include <hwloc/plugins.h>
+#include "hwloc/plugins.h"
 
 struct hwloc_topology;
 
 extern int hwloc_disc_component_force_enable(struct hwloc_topology *topology,
 					     int envvar_forced, /* 1 if forced through envvar, 0 if forced through API */
-					     int type, const char *name,
+					     const char *name,
 					     const void *data1, const void *data2, const void *data3);
 extern void hwloc_disc_components_enable_others(struct hwloc_topology *topology);
 
-/* Compute the topology is_thissystem flag based on enabled backends */
+/* Compute the topology is_thissystem flag and find some callbacks based on enabled backends */
 extern void hwloc_backends_is_thissystem(struct hwloc_topology *topology);
+extern void hwloc_backends_find_callbacks(struct hwloc_topology *topology);
 
+/* Initialize the lists of components and backends used by a topology */
+extern void hwloc_topology_components_init(struct hwloc_topology *topology);
 /* Disable and destroy all backends used by a topology */
 extern void hwloc_backends_disable_all(struct hwloc_topology *topology);
+/* Cleanup the lists of components used by a topology */
+extern void hwloc_topology_components_fini(struct hwloc_topology *topology);
 
 /* Used by the core to setup/destroy the list of components */
-extern void hwloc_components_init(struct hwloc_topology *topology); /* increases components refcount, should be called exactly once per topology (during init) */
-extern void hwloc_components_destroy_all(struct hwloc_topology *topology); /* decreases components refcount, should be called exactly once per topology (during destroy) */
+extern void hwloc_components_init(void); /* increases components refcount, should be called exactly once per topology (during init) */
+extern void hwloc_components_fini(void); /* decreases components refcount, should be called exactly once per topology (during destroy) */
 
 #endif /* PRIVATE_COMPONENTS_H */
 
diff --git a/ext/hwloc/include/private/cpuid-x86.h b/ext/hwloc/include/private/cpuid-x86.h
index 8a8c48e5d..2758afe04 100644
--- a/ext/hwloc/include/private/cpuid-x86.h
+++ b/ext/hwloc/include/private/cpuid-x86.h
@@ -72,14 +72,11 @@ static __hwloc_inline void hwloc_x86_cpuid(unsigned *eax, unsigned *ebx, unsigne
   : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
     "+c" (*ecx), "=&d" (*edx));
 #elif defined(HWLOC_X86_32_ARCH)
-  unsigned long sav_ebx;
   __asm__(
-  "mov %%ebx,%2\n\t"
+  "mov %%ebx,%1\n\t"
   "cpuid\n\t"
-  "xchg %2,%%ebx\n\t"
-  "movl %k2,%1\n\t"
-  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
-    "+c" (*ecx), "=&d" (*edx));
+  "xchg %%ebx,%1\n\t"
+  : "+a" (*eax), "=&SD" (*ebx), "+c" (*ecx), "=&d" (*edx));
 #else
 #error unknown architecture
 #endif
diff --git a/ext/hwloc/include/private/debug.h b/ext/hwloc/include/private/debug.h
index 4de91bf8a..637e0141e 100644
--- a/ext/hwloc/include/private/debug.h
+++ b/ext/hwloc/include/private/debug.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2012 Inria.  All rights reserved.
+ * Copyright © 2009-2017 Inria.  All rights reserved.
  * Copyright © 2009, 2011 Université Bordeaux
  * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -11,43 +11,69 @@
 #ifndef HWLOC_DEBUG_H
 #define HWLOC_DEBUG_H
 
-#include <private/autogen/config.h>
+#include "private/autogen/config.h"
+#include "private/misc.h"
 
 #ifdef HWLOC_DEBUG
 #include <stdarg.h>
 #include <stdio.h>
 #endif
 
+/* Compile-time assertion */
+#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
+
+#ifdef HWLOC_DEBUG
+static __hwloc_inline int hwloc_debug_enabled(void) /* returns nonzero when verbose debug output should be printed */
+{
+  static int checked = 0; /* becomes 1 once the environment has been consulted */
+  static int enabled = 1; /* verbose output stays on unless the environment says otherwise */
+  if (!checked) {
+    const char *env = getenv("HWLOC_DEBUG_VERBOSE");
+    if (env)
+      enabled = atoi(env); /* e.g. HWLOC_DEBUG_VERBOSE=0 turns the output off */
+    if (enabled)
+      fprintf(stderr, "hwloc verbose debug enabled, may be disabled with HWLOC_DEBUG_VERBOSE=0 in the environment.\n");
+    checked = 1;
+  }
+  return enabled;
+}
+#endif
+
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...) __hwloc_attribute_format(printf, 1, 2);
 static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...)
 {
 #ifdef HWLOC_DEBUG
+  if (hwloc_debug_enabled()) {
     va_list ap;
-
     va_start(ap, s);
     vfprintf(stderr, s, ap);
     va_end(ap);
+  }
 #endif
 }
 
 #ifdef HWLOC_DEBUG
 #define hwloc_debug_bitmap(fmt, bitmap) do { \
+if (hwloc_debug_enabled()) { \
   char *s; \
   hwloc_bitmap_asprintf(&s, bitmap); \
   fprintf(stderr, fmt, s); \
   free(s); \
-} while (0)
+} } while (0)
 #define hwloc_debug_1arg_bitmap(fmt, arg1, bitmap) do { \
+if (hwloc_debug_enabled()) { \
   char *s; \
   hwloc_bitmap_asprintf(&s, bitmap); \
   fprintf(stderr, fmt, arg1, s); \
   free(s); \
-} while (0)
+} } while (0)
 #define hwloc_debug_2args_bitmap(fmt, arg1, arg2, bitmap) do { \
+if (hwloc_debug_enabled()) { \
   char *s; \
   hwloc_bitmap_asprintf(&s, bitmap); \
   fprintf(stderr, fmt, arg1, arg2, s); \
   free(s); \
-} while (0)
+} } while (0)
 #else
 #define hwloc_debug_bitmap(s, bitmap) do { } while(0)
 #define hwloc_debug_1arg_bitmap(s, arg1, bitmap) do { } while(0)
diff --git a/ext/hwloc/include/private/misc.h b/ext/hwloc/include/private/misc.h
index d0e6a465f..6c02d793b 100644
--- a/ext/hwloc/include/private/misc.h
+++ b/ext/hwloc/include/private/misc.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -11,8 +11,9 @@
 #ifndef HWLOC_PRIVATE_MISC_H
 #define HWLOC_PRIVATE_MISC_H
 
-#include <hwloc/autogen/config.h>
-#include <private/autogen/config.h>
+#include "hwloc/autogen/config.h"
+#include "private/autogen/config.h"
+#include "hwloc.h"
 
 #ifdef HWLOC_HAVE_DECL_STRNCASECMP
 #ifdef HAVE_STRINGS_H
@@ -24,9 +25,6 @@
 #endif
 #endif
 
-/* Compile-time assertion */
-#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
-
 #define HWLOC_BITS_PER_LONG (HWLOC_SIZEOF_UNSIGNED_LONG * 8)
 #define HWLOC_BITS_PER_INT (HWLOC_SIZEOF_UNSIGNED_INT * 8)
 
@@ -38,6 +36,8 @@
 #error "unknown size for unsigned int."
 #endif
 
+/* internal-use-only value for when we don't know the type or don't have any value */
+#define HWLOC_OBJ_TYPE_NONE ((hwloc_obj_type_t) -1)
 
 /**
  * ffsl helpers.
@@ -189,9 +189,9 @@ hwloc_ffsl_from_ffs32(unsigned long x)
 #ifdef __GNUC_____
 
 #  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
-#    define hwloc_flsl(x) (x ? 8*sizeof(long) - __builtin_clzl(x) : 0)
+#    define hwloc_flsl(x) ((x) ? (8*sizeof(long) - __builtin_clzl(x)) : 0)
 #  else
-#    define hwloc_fls(x) (x ? 8*sizeof(int) - __builtin_clz(x) : 0)
+#    define hwloc_fls(x) ((x) ? (8*sizeof(int) - __builtin_clz(x)) : 0)
 #    define HWLOC_NEED_FLSL
 #  endif
 
@@ -209,7 +209,7 @@ extern int flsl(long) __hwloc_attribute_const;
 extern int clzl(long) __hwloc_attribute_const;
 #  endif
 
-#  define hwloc_flsl(x) (x ? 8*sizeof(long) - clzl(x) : 0)
+#  define hwloc_flsl(x) ((x) ? (8*sizeof(long) - clzl(x)) : 0)
 
 #elif defined(HWLOC_HAVE_FLS)
 
@@ -226,7 +226,7 @@ extern int fls(int) __hwloc_attribute_const;
 extern int clz(int) __hwloc_attribute_const;
 #  endif
 
-#  define hwloc_fls(x) (x ? 8*sizeof(int) - clz(x) : 0)
+#  define hwloc_fls(x) ((x) ? (8*sizeof(int) - clz(x)) : 0)
 #  define HWLOC_NEED_FLSL
 
 #else /* no fls implementation */
@@ -360,7 +360,7 @@ hwloc_weight_long(unsigned long w)
 #endif /* HWLOC_BITS_PER_LONG == 64 */
 }
 
-#if !HAVE_DECL_STRTOULL
+#if !HAVE_DECL_STRTOULL && defined(HAVE_STRTOULL)
 unsigned long long int strtoull(const char *nptr, char **endptr, int base);
 #endif
 
@@ -379,4 +379,198 @@ static __hwloc_inline int hwloc_strncasecmp(const char *s1, const char *s2, size
 #endif
 }
 
+static __hwloc_inline hwloc_obj_type_t hwloc_cache_type_by_depth_type(unsigned depth, hwloc_obj_cache_type_t type)
+{
+  /* Map a cache depth and kind to an object type: instruction caches are
+   * looked up among L1i-L3i, every other kind among L1-L5.
+   * Depths outside those ranges yield HWLOC_OBJ_TYPE_NONE. */
+  const int is_icache = (type == HWLOC_OBJ_CACHE_INSTRUCTION);
+  const unsigned max_depth = is_icache ? 3 : 5;
+
+  if (depth < 1 || depth > max_depth)
+    return HWLOC_OBJ_TYPE_NONE;
+  if (is_icache)
+    return HWLOC_OBJ_L1ICACHE + depth-1;
+  return HWLOC_OBJ_L1CACHE + depth-1;
+}
+
+#define HWLOC_BITMAP_EQUAL 0       /* Bitmaps are equal */
+#define HWLOC_BITMAP_INCLUDED 1    /* First bitmap included in second */
+#define HWLOC_BITMAP_CONTAINS 2    /* First bitmap contains second */
+#define HWLOC_BITMAP_INTERSECTS 3  /* Bitmaps intersect without any inclusion */
+#define HWLOC_BITMAP_DIFFERENT  4  /* Bitmaps do not intersect */
+
+/* Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view. */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/* Return a stringified PCI class. */
+HWLOC_DECLSPEC extern const char * hwloc_pci_class_string(unsigned short class_id);
+
+/* Parse a PCI link speed (GT/s) string from Linux sysfs */
+#ifdef HWLOC_LINUX_SYS
+#include <stdlib.h> /* for atof() */
+static __hwloc_inline float
+hwloc_linux_pci_link_speed_from_string(const char *string)
+{
+  /* Gen1 and Gen2 are matched textually instead of going through atof():
+   * atof() expects a localized string while the kernel sysfs files aren't,
+   * and both generations use the 8/10 encoding.
+   */
+  const int is_gen1 = (strncmp(string, "2.5 ", 4) == 0); /* "2.5 GT/s" */
+  const int is_gen2 = (strncmp(string, "5 ", 2) == 0);   /* "5 GT/s" */
+
+  if (is_gen1)
+    return 2.5 * .8;
+  if (is_gen2)
+    return 5 * .8;
+
+  /* Gen3+ is handled generically: parse the number, apply 128/130 encoding */
+  return atof(string) * 128./130;
+}
+#endif
+
+/* Traverse one child list of a parent (normal, memory, I/O or Misc); arguments are parenthesized so that expressions such as &obj are safe */
+#define for_each_child(child, parent) for((child) = (parent)->first_child; (child); (child) = (child)->next_sibling)
+#define for_each_memory_child(child, parent) for((child) = (parent)->memory_first_child; (child); (child) = (child)->next_sibling)
+#define for_each_io_child(child, parent) for((child) = (parent)->io_first_child; (child); (child) = (child)->next_sibling)
+#define for_each_misc_child(child, parent) for((child) = (parent)->misc_first_child; (child); (child) = (child)->next_sibling)
+
+/* Any object attached to normal children */
+static __hwloc_inline int hwloc__obj_type_is_normal (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type <= HWLOC_OBJ_GROUP || type == HWLOC_OBJ_DIE;
+}
+
+/* Any object attached to memory children, currently NUMA nodes or Memory-side caches */
+static __hwloc_inline int hwloc__obj_type_is_memory (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MEMCACHE;
+}
+
+/* I/O or Misc object, without cpusets or nodesets. */
+static __hwloc_inline int hwloc__obj_type_is_special (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_MISC;
+}
+
+/* Any object attached to io children */
+static __hwloc_inline int hwloc__obj_type_is_io (hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE;
+}
+
+/* Any CPU caches (not Memory-side caches) */
+static __hwloc_inline int
+hwloc__obj_type_is_cache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1CACHE && type <= HWLOC_OBJ_L3ICACHE);
+}
+
+static __hwloc_inline int
+hwloc__obj_type_is_dcache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1CACHE && type <= HWLOC_OBJ_L5CACHE);
+}
+
+/** \brief Check whether an object is an Instruction Cache. */
+static __hwloc_inline int
+hwloc__obj_type_is_icache(hwloc_obj_type_t type)
+{
+  /* type contiguity is asserted in topology_check() */
+  return (type >= HWLOC_OBJ_L1ICACHE && type <= HWLOC_OBJ_L3ICACHE);
+}
+
+#ifdef HAVE_USELOCALE
+#include "locale.h"
+#ifdef HAVE_XLOCALE_H
+#include "xlocale.h"
+#endif
+#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
+#define hwloc_localeswitch_init() do {                     \
+  __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
+  if (__new_locale != (locale_t)0)                         \
+    __old_locale = uselocale(__new_locale);                \
+} while (0)
+#define hwloc_localeswitch_fini() do { \
+  if (__new_locale != (locale_t)0) {   \
+    uselocale(__old_locale);           \
+    freelocale(__new_locale);          \
+  }                                    \
+} while(0)
+#else /* HAVE_USELOCALE */
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
+#define hwloc_localeswitch_init()
+#else
+#define hwloc_localeswitch_declare int __dummy_nolocale
+#define hwloc_localeswitch_init() (void)__dummy_nolocale
+#endif
+#define hwloc_localeswitch_fini()
+#endif /* HAVE_USELOCALE */
+
+#if !HAVE_DECL_FABSF
+#define fabsf(f) fabs((double)(f))
+#endif
+
+#if !HAVE_DECL_MODFF /* fallback on modf(); NOTE(review): iptr is cast to double*, so modf() stores a double there - confirm callers do not pass a float* */
+#define modff(x,iptr) ((float)modf((double)(x),(double *)(iptr)))
+#endif
+
+#if HAVE_DECL__SC_PAGE_SIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
+#elif HAVE_DECL__SC_PAGESIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
+#elif defined HAVE_GETPAGESIZE
+#define hwloc_getpagesize() getpagesize()
+#else
+#undef hwloc_getpagesize
+#endif
+
+#if HWLOC_HAVE_ATTRIBUTE_FORMAT
+#  define __hwloc_attribute_format(type, str, arg)  __attribute__((__format__(type, str, arg)))
+#else
+#  define __hwloc_attribute_format(type, str, arg)
+#endif
+
+#define hwloc_memory_size_printf_value(_size, _verbose) \
+  ((_size) < (10ULL<<20) || (_verbose) ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
+#define hwloc_memory_size_printf_unit(_size, _verbose) \
+  ((_size) < (10ULL<<20) || (_verbose) ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
+
+#ifdef HWLOC_WIN_SYS
+#  ifndef HAVE_SSIZE_T
+typedef SSIZE_T ssize_t;
+#  endif
+#  if !HAVE_DECL_STRTOULL && !defined(HAVE_STRTOULL)
+#    define strtoull _strtoui64
+#  endif
+#  ifndef S_ISREG
+#    define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) /* compare the whole file-type field, like S_ISDIR below */
+#  endif
+#  ifndef S_ISDIR
+#    define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+#  endif
+#  ifndef S_IRWXU
+#    define S_IRWXU 00700
+#  endif
+#  ifndef HWLOC_HAVE_DECL_STRCASECMP
+#    define strcasecmp _stricmp
+#  endif
+#  if !HAVE_DECL_SNPRINTF
+#    define snprintf _snprintf
+#  endif
+#  if HAVE_DECL__STRDUP
+#    define strdup _strdup
+#  endif
+#  if HAVE_DECL__PUTENV
+#    define putenv _putenv
+#  endif
+#endif
+
 #endif /* HWLOC_PRIVATE_MISC_H */
diff --git a/ext/hwloc/include/private/private.h b/ext/hwloc/include/private/private.h
index fa344ace9..5f8789376 100644
--- a/ext/hwloc/include/private/private.h
+++ b/ext/hwloc/include/private/private.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009      CNRS
- * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2019 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
  *
@@ -22,11 +22,12 @@
 #ifndef HWLOC_PRIVATE_H
 #define HWLOC_PRIVATE_H
 
-#include <private/autogen/config.h>
-#include <hwloc.h>
-#include <hwloc/bitmap.h>
-#include <private/components.h>
-#include <private/debug.h>
+#include "private/autogen/config.h"
+#include "hwloc.h"
+#include "hwloc/bitmap.h"
+#include "private/components.h"
+#include "private/misc.h"
+
 #include <sys/types.h>
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
@@ -39,42 +40,55 @@
 #endif
 #include <string.h>
 
-enum hwloc_ignore_type_e {
-  HWLOC_IGNORE_TYPE_NEVER = 0,
-  HWLOC_IGNORE_TYPE_KEEP_STRUCTURE,
-  HWLOC_IGNORE_TYPE_ALWAYS
-};
+#define HWLOC_TOPOLOGY_ABI 0x20100 /* version of the layout of struct topology */
 
-#define HWLOC_DEPTH_MAX 128
+/*****************************************************
+ * WARNING:
+ * changes below in this structure (and its children)
+ * should cause a bump of HWLOC_TOPOLOGY_ABI.
+ *****************************************************/
 
 struct hwloc_topology {
+  unsigned topology_abi;
+
   unsigned nb_levels;					/* Number of horizontal levels */
-  unsigned next_group_depth;				/* Depth of the next Group object that we may create */
-  unsigned level_nbobjects[HWLOC_DEPTH_MAX]; 		/* Number of objects on each horizontal level */
-  struct hwloc_obj **levels[HWLOC_DEPTH_MAX];		/* Direct access to levels, levels[l = 0 .. nblevels-1][0..level_nbobjects[l]] */
+  unsigned nb_levels_allocated;				/* Number of levels allocated and zeroed in level_nbobjects and levels below */
+  unsigned *level_nbobjects; 				/* Number of objects on each horizontal level */
+  struct hwloc_obj ***levels;				/* Direct access to levels, levels[l = 0 .. nblevels-1][0..level_nbobjects[l]] */
   unsigned long flags;
   int type_depth[HWLOC_OBJ_TYPE_MAX];
-  enum hwloc_ignore_type_e ignored_types[HWLOC_OBJ_TYPE_MAX];
+  enum hwloc_type_filter_e type_filter[HWLOC_OBJ_TYPE_MAX];
   int is_thissystem;
   int is_loaded;
   int modified;                                         /* >0 if objects were added/removed recently, which means a reconnect is needed */
   hwloc_pid_t pid;                                      /* Process ID the topology is view from, 0 for self */
   void *userdata;
-
-  unsigned bridge_nbobjects;
-  struct hwloc_obj **bridge_level;
-  struct hwloc_obj *first_bridge, *last_bridge;
-  unsigned pcidev_nbobjects;
-  struct hwloc_obj **pcidev_level;
-  struct hwloc_obj *first_pcidev, *last_pcidev;
-  unsigned osdev_nbobjects;
-  struct hwloc_obj **osdev_level;
-  struct hwloc_obj *first_osdev, *last_osdev;
-  unsigned misc_nbobjects;
-  struct hwloc_obj **misc_level;
-  struct hwloc_obj *first_misc, *last_misc;
+  uint64_t next_gp_index;
+
+  void *adopted_shmem_addr;
+  size_t adopted_shmem_length;
+
+#define HWLOC_NR_SLEVELS 6
+#define HWLOC_SLEVEL_NUMANODE 0
+#define HWLOC_SLEVEL_BRIDGE 1
+#define HWLOC_SLEVEL_PCIDEV 2
+#define HWLOC_SLEVEL_OSDEV 3
+#define HWLOC_SLEVEL_MISC 4
+#define HWLOC_SLEVEL_MEMCACHE 5
+  /* order must match negative depth, it's asserted in setup_defaults() */
+#define HWLOC_SLEVEL_FROM_DEPTH(x) (HWLOC_TYPE_DEPTH_NUMANODE-(x))
+#define HWLOC_SLEVEL_TO_DEPTH(x) (HWLOC_TYPE_DEPTH_NUMANODE-(x))
+  struct hwloc_special_level_s {
+    unsigned nbobjs;
+    struct hwloc_obj **objs;
+    struct hwloc_obj *first, *last; /* Temporarily used while listing object before building the objs array */
+  } slevels[HWLOC_NR_SLEVELS];
+
+  hwloc_bitmap_t allowed_cpuset;
+  hwloc_bitmap_t allowed_nodeset;
 
   struct hwloc_binding_hooks {
+    /* These are actually rather OS hooks since some of them are not about binding */
     int (*set_thisproc_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
     int (*get_thisproc_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
     int (*set_thisthread_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
@@ -98,48 +112,123 @@ struct hwloc_topology {
     int (*get_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
     int (*set_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
     int (*get_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*get_area_memlocation)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags);
     /* This has to return the same kind of pointer as alloc_membind, so that free_membind can be used on it */
     void *(*alloc)(hwloc_topology_t topology, size_t len);
     /* alloc_membind has to always succeed if !(flags & HWLOC_MEMBIND_STRICT).
      * see hwloc_alloc_or_fail which is convenient for that.  */
     void *(*alloc_membind)(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
     int (*free_membind)(hwloc_topology_t topology, void *addr, size_t len);
+
+    int (*get_allowed_resources)(hwloc_topology_t topology);
   } binding_hooks;
 
   struct hwloc_topology_support support;
 
   void (*userdata_export_cb)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj);
   void (*userdata_import_cb)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length);
+  int userdata_not_decoded;
+
+  struct hwloc_internal_distances_s {
+    char *name; /* FIXME: needs an API to set it from user */
+
+    unsigned id; /* to match the container id field of public distances structure
+		  * not exported to XML, regenerated during _add()
+		  */
 
-  struct hwloc_os_distances_s {
-    hwloc_obj_type_t type;
-    int nbobjs;
-    unsigned *indexes; /* array of OS indexes before we can convert them into objs. always available.
+    /* if all objects have the same type, different_types is NULL and unique_type is valid.
+     * otherwise unique_type is HWLOC_OBJ_TYPE_NONE and different_types contains individual objects types.
+     */
+    hwloc_obj_type_t unique_type;
+    hwloc_obj_type_t *different_types;
+
+    /* add union hwloc_obj_attr_u if we ever support groups */
+    unsigned nbobjs;
+    uint64_t *indexes; /* array of OS or GP indexes before we can convert them into objs.
+			* OS indexes for distances covering only PUs or only NUMAnodes.
 			*/
-    struct hwloc_obj **objs; /* array of objects, in the same order as above.
-			      * either given (by a backend) together with the indexes array above.
-			      * or build from the above indexes array when not given (by the user).
-			      */
-    float *distances; /* distance matrices, ordered according to the above indexes/objs array.
+#define HWLOC_DIST_TYPE_USE_OS_INDEX(_type) ((_type) == HWLOC_OBJ_PU || (_type) == HWLOC_OBJ_NUMANODE) /* true for the PU and NUMANODE types only */
+    uint64_t *values; /* distance matrices, ordered according to the above indexes/objs array.
 		       * distance from i to j is stored in slot i*nbnodes+j.
-		       * will be copied into the main logical-index-ordered distance at the end of the discovery.
 		       */
-    int forced; /* set if the user forced a matrix to ignore the OS one */
+    unsigned long kind;
+
+#define HWLOC_INTERNAL_DIST_FLAG_OBJS_VALID (1U<<0) /* if the objs array is valid below */
+    unsigned iflags;
+
+    /* objects are currently stored in physical_index order */
+    hwloc_obj_t *objs; /* array of objects */
 
-    struct hwloc_os_distances_s *prev, *next;
-  } *first_osdist, *last_osdist;
+    struct hwloc_internal_distances_s *prev, *next;
+  } *first_dist, *last_dist;
+  unsigned next_dist_id;
+
+  int grouping;
+  int grouping_verbose;
+  unsigned grouping_nbaccuracies;
+  float grouping_accuracies[5];
+  unsigned grouping_next_subkind;
 
   /* list of enabled backends. */
   struct hwloc_backend * backends;
+  struct hwloc_backend * get_pci_busid_cpuset_backend; /* first backend that provides get_pci_busid_cpuset() callback */
+  unsigned backend_phases;
+  unsigned backend_excluded_phases;
+
+  /* memory allocator for topology objects */
+  struct hwloc_tma * tma;
+
+/*****************************************************
+ * WARNING:
+ * changes above in this structure (and its children)
+ * should cause a bump of HWLOC_TOPOLOGY_ABI.
+ *****************************************************/
+
+  /*
+   * temporary variables during discovery
+   */
+
+  /* machine-wide memory.
+   * temporarily stored there by OSes that only provide this without NUMA information,
+   * and actually used later by the core.
+   */
+  struct hwloc_numanode_attr_s machine_memory;
+
+  /* pci stuff */
+  int pci_has_forced_locality;
+  unsigned pci_forced_locality_nr;
+  struct hwloc_pci_forced_locality_s {
+    unsigned domain;
+    unsigned bus_first, bus_last;
+    hwloc_bitmap_t cpuset;
+  } * pci_forced_locality;
+
+  /* component blacklisting */
+  unsigned nr_blacklisted_components;
+  struct hwloc_topology_forced_component_s {
+    struct hwloc_disc_component *component;
+    unsigned phases;
+  } *blacklisted_components;
+
+  /* FIXME: keep until topo destroy and reuse for finding specific buses */
+  struct hwloc_pci_locality_s {
+    unsigned domain;
+    unsigned bus_min;
+    unsigned bus_max;
+    hwloc_bitmap_t cpuset;
+    hwloc_obj_t parent;
+    struct hwloc_pci_locality_s *prev, *next;
+  } *first_pci_locality, *last_pci_locality;
 };
 
-extern void hwloc_alloc_obj_cpusets(hwloc_obj_t obj);
+extern void hwloc_alloc_root_sets(hwloc_obj_t root);
 extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
 extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
 extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
-extern unsigned hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
-extern void hwloc_connect_children(hwloc_obj_t obj);
-extern int hwloc_connect_levels(hwloc_topology_t topology);
+
+/* returns the number of CPUs reported by the OS (only valid if thissystem) */
+#define HWLOC_FALLBACK_NBPROCESSORS_INCLUDE_OFFLINE 1 /* by default we try to get only the online CPUs */
+extern int hwloc_fallback_nbprocessors(unsigned flags);
 
 extern int hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2);
 extern void hwloc__reorder_children(hwloc_obj_t parent);
@@ -147,10 +236,30 @@ extern void hwloc__reorder_children(hwloc_obj_t parent);
 extern void hwloc_topology_setup_defaults(struct hwloc_topology *topology);
 extern void hwloc_topology_clear(struct hwloc_topology *topology);
 
-extern void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value);
-extern char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name);
-extern void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_obj_info_s **src_infosp, unsigned *src_countp);
-extern void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count);
+/* insert memory object as memory child of normal parent */
+extern struct hwloc_obj * hwloc__attach_memory_object(struct hwloc_topology *topology, hwloc_obj_t parent,
+						      hwloc_obj_t obj,
+						      hwloc_report_error_t report_error);
+
+extern void hwloc_pci_discovery_init(struct hwloc_topology *topology);
+extern void hwloc_pci_discovery_prepare(struct hwloc_topology *topology);
+extern void hwloc_pci_discovery_exit(struct hwloc_topology *topology);
+
+/* Look for an object matching the given domain/bus/func,
+ * either exactly or return the smallest container bridge
+ */
+extern struct hwloc_obj * hwloc_pci_find_by_busid(struct hwloc_topology *topology, unsigned domain, unsigned bus, unsigned dev, unsigned func);
+
+/* Look for an object matching complete cpuset exactly, or insert one.
+ * Return NULL on failure.
+ * Return a good fallback (object above) on failure to insert.
+ */
+extern hwloc_obj_t hwloc_find_insert_io_parent_by_complete_cpuset(struct hwloc_topology *topology, hwloc_cpuset_t cpuset);
+
+extern int hwloc__add_info(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value);
+extern int hwloc__add_info_nodup(struct hwloc_info_s **infosp, unsigned *countp, const char *name, const char *value, int replace);
+extern int hwloc__move_infos(struct hwloc_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_info_s **src_infosp, unsigned *src_countp);
+extern void hwloc__free_infos(struct hwloc_info_s *infos, unsigned count);
 
 /* set native OS binding hooks */
 extern void hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support);
@@ -173,10 +282,6 @@ extern void hwloc_set_solaris_hooks(struct hwloc_binding_hooks *binding_hooks, s
 extern void hwloc_set_aix_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
 #endif /* HWLOC_AIX_SYS */
 
-#ifdef HWLOC_OSF_SYS
-extern void hwloc_set_osf_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
-#endif /* HWLOC_OSF_SYS */
-
 #ifdef HWLOC_WIN_SYS
 extern void hwloc_set_windows_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
 #endif /* HWLOC_WIN_SYS */
@@ -197,17 +302,24 @@ extern void hwloc_set_netbsd_hooks(struct hwloc_binding_hooks *binding_hooks, st
 extern void hwloc_set_hpux_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
 #endif /* HWLOC_HPUX_SYS */
 
+extern int hwloc_look_hardwired_fujitsu_k(struct hwloc_topology *topology);
+extern int hwloc_look_hardwired_fujitsu_fx10(struct hwloc_topology *topology);
+extern int hwloc_look_hardwired_fujitsu_fx100(struct hwloc_topology *topology);
+
 /* Insert uname-specific names/values in the object infos array.
  * If cached_uname isn't NULL, it is used as a struct utsname instead of recalling uname.
  * Any field that starts with \0 is ignored.
  */
 extern void hwloc_add_uname_info(struct hwloc_topology *topology, void *cached_uname);
 
-/* Free obj and its attributes assuming it doesn't have any children/parent anymore */
+/* Free obj and its attributes assuming it's not linked to a parent and doesn't have any child */
 extern void hwloc_free_unlinked_object(hwloc_obj_t obj);
 
-/* Duplicate src and its children under newparent in newtopology */
-extern void hwloc__duplicate_objects(struct hwloc_topology *newtopology, struct hwloc_obj *newparent, struct hwloc_obj *src);
+/* Free obj and its children, assuming it's not linked to a parent */
+extern void hwloc_free_object_and_children(hwloc_obj_t obj);
+
+/* Free obj, its next siblings, and their children, assuming they're not linked to a parent */
+extern void hwloc_free_object_siblings_and_children(hwloc_obj_t obj);
 
 /* This can be used for the alloc field to get allocated data that can be freed by free() */
 void *hwloc_alloc_heap(hwloc_topology_t topology, size_t len);
@@ -231,54 +343,14 @@ hwloc_alloc_or_fail(hwloc_topology_t topology, size_t len, int flags)
   return hwloc_alloc(topology, len);
 }
 
-extern void hwloc_distances_init(struct hwloc_topology *topology);
-extern void hwloc_distances_destroy(struct hwloc_topology *topology);
-extern void hwloc_distances_set(struct hwloc_topology *topology, hwloc_obj_type_t type, unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances, int force);
-extern void hwloc_distances_set_from_env(struct hwloc_topology *topology);
-extern void hwloc_distances_restrict_os(struct hwloc_topology *topology);
-extern void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags);
-extern void hwloc_distances_finalize_os(struct hwloc_topology *topology);
-extern void hwloc_distances_finalize_logical(struct hwloc_topology *topology);
-extern void hwloc_clear_object_distances(struct hwloc_obj *obj);
-extern void hwloc_clear_object_distances_one(struct hwloc_distances_s *distances);
-extern void hwloc_group_by_distances(struct hwloc_topology *topology);
-
-#ifdef HAVE_USELOCALE
-#include "locale.h"
-#ifdef HAVE_XLOCALE_H
-#include "xlocale.h"
-#endif
-#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
-#define hwloc_localeswitch_init() do {                     \
-  __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
-  if (__new_locale != (locale_t)0)                         \
-    __old_locale = uselocale(__new_locale);                \
-} while (0)
-#define hwloc_localeswitch_fini() do { \
-  if (__new_locale != (locale_t)0) {   \
-    uselocale(__old_locale);           \
-    freelocale(__new_locale);          \
-  }                                    \
-} while(0)
-#else /* HAVE_USELOCALE */
-#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
-#define hwloc_localeswitch_init()
-#define hwloc_localeswitch_fini()
-#endif /* HAVE_USELOCALE */
-
-#if !HAVE_DECL_FABSF
-#define fabsf(f) fabs((double)(f))
-#endif
-
-#if HAVE_DECL__SC_PAGE_SIZE
-#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
-#elif HAVE_DECL__SC_PAGESIZE
-#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
-#elif defined HAVE_GETPAGESIZE
-#define hwloc_getpagesize() getpagesize()
-#else
-#undef hwloc_getpagesize
-#endif
+extern void hwloc_internal_distances_init(hwloc_topology_t topology);
+extern void hwloc_internal_distances_prepare(hwloc_topology_t topology);
+extern void hwloc_internal_distances_destroy(hwloc_topology_t topology);
+extern int hwloc_internal_distances_dup(hwloc_topology_t new, hwloc_topology_t old);
+extern void hwloc_internal_distances_refresh(hwloc_topology_t topology);
+extern int hwloc_internal_distances_add(hwloc_topology_t topology, const char *name, unsigned nbobjs, hwloc_obj_t *objs, uint64_t *values, unsigned long kind, unsigned long flags);
+extern int hwloc_internal_distances_add_by_index(hwloc_topology_t topology, const char *name, hwloc_obj_type_t unique_type, hwloc_obj_type_t *different_types, unsigned nbobjs, uint64_t *indexes, uint64_t *values, unsigned long kind, unsigned long flags);
+extern void hwloc_internal_distances_invalidate_cached_objs(hwloc_topology_t topology);
 
 /* encode src buffer into target buffer.
  * targsize must be at least 4*((srclength+2)/3)+1.
@@ -293,43 +365,94 @@ extern int hwloc_encode_to_base64(const char *src, size_t srclength, char *targe
  */
 extern int hwloc_decode_from_base64(char const *src, char *target, size_t targsize);
 
-/* Check whether needle matches the beginning of haystack, at least n, and up
- * to a colon or \0 */
-extern int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n);
-
-#ifdef HWLOC_HAVE_ATTRIBUTE_FORMAT
-# if HWLOC_HAVE_ATTRIBUTE_FORMAT
-#  define __hwloc_attribute_format(type, str, arg)  __attribute__((__format__(type, str, arg)))
-# else
-#  define __hwloc_attribute_format(type, str, arg)
-# endif
-#else
-# define __hwloc_attribute_format(type, str, arg)
-#endif
-
-#define hwloc_memory_size_printf_value(_size, _verbose) \
-  ((_size) < (10ULL<<20) || _verbose ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
-#define hwloc_memory_size_printf_unit(_size, _verbose) \
-  ((_size) < (10ULL<<20) || _verbose ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
-
 /* On some systems, snprintf returns the size of written data, not the actually
- * required size.  hwloc_snprintf always report the actually required size. */
+ * required size. Sometimes it returns -1 on truncation too.
+ * And sometimes it doesn't like NULL output buffers.
+ * http://www.gnu.org/software/gnulib/manual/html_node/snprintf.html
+ *
+ * hwloc_snprintf behaves properly, but it's a bit overkill on the vast majority
+ * of platforms, so don't enable it unless really needed.
+ */
+#ifdef HWLOC_HAVE_CORRECT_SNPRINTF
+#define hwloc_snprintf snprintf
+#else
 extern int hwloc_snprintf(char *str, size_t size, const char *format, ...) __hwloc_attribute_format(printf, 3, 4);
-
-extern void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup);
+#endif
 
 /* Return the name of the currently running program, if supported.
  * If not NULL, must be freed by the caller.
  */
 extern char * hwloc_progname(struct hwloc_topology *topology);
 
-#define HWLOC_BITMAP_EQUAL 0       /* Bitmaps are equal */
-#define HWLOC_BITMAP_INCLUDED 1    /* First bitmap included in second */
-#define HWLOC_BITMAP_CONTAINS 2    /* First bitmap contains second */
-#define HWLOC_BITMAP_INTERSECTS 3  /* Bitmaps intersect without any inclusion */
-#define HWLOC_BITMAP_DIFFERENT  4  /* Bitmaps do not intersect */
-
-/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view.
+/* obj->attr->group.kind internal values.
+ * the core will keep the smallest ones when merging two groups,
+ * that's why user-given kinds are first.
  */
-HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+/* first, user-given groups, should remain as long as possible */
+#define HWLOC_GROUP_KIND_USER				0	/* user-given, user may use subkind too */
+#define HWLOC_GROUP_KIND_SYNTHETIC			10	/* subkind is group depth within synthetic description */
+/* then, hardware-specific groups */
+#define HWLOC_GROUP_KIND_INTEL_KNL_SUBNUMA_CLUSTER	100	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_EXTTOPOENUM_UNKNOWN	101	/* subkind is unknown level */
+#define HWLOC_GROUP_KIND_INTEL_MODULE			102	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_TILE			103	/* no subkind */
+#define HWLOC_GROUP_KIND_INTEL_DIE			104	/* no subkind */
+#define HWLOC_GROUP_KIND_S390_BOOK			110	/* subkind 0 is book, subkind 1 is drawer (group of books) */
+#define HWLOC_GROUP_KIND_AMD_COMPUTE_UNIT		120	/* no subkind */
+/* then, OS-specific groups */
+#define HWLOC_GROUP_KIND_SOLARIS_PG_HW_PERF		200	/* subkind is group width */
+#define HWLOC_GROUP_KIND_AIX_SDL_UNKNOWN		210	/* subkind is SDL level */
+#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP	220	/* no subkind */
+#define HWLOC_GROUP_KIND_WINDOWS_RELATIONSHIP_UNKNOWN	221	/* no subkind */
+/* distance groups */
+#define HWLOC_GROUP_KIND_DISTANCE			900	/* subkind is round of adding these groups during distance based grouping */
+/* finally, hwloc-specific groups required to insert something else, should disappear as soon as possible */
+#define HWLOC_GROUP_KIND_IO				1000	/* no subkind */
+#define HWLOC_GROUP_KIND_MEMORY				1001	/* no subkind */
+
+/* memory allocator for topology objects */
+struct hwloc_tma {
+  void * (*malloc)(struct hwloc_tma *, size_t);
+  void *data;
+  int dontfree; /* when set, free() or realloc() cannot be used, and tma->malloc() cannot fail */
+};
+
+static __hwloc_inline void *
+hwloc_tma_malloc(struct hwloc_tma *tma,
+		 size_t size)
+{
+  /* without a custom allocator, fall back to the regular heap;
+   * otherwise delegate to the allocator's own malloc hook */
+  if (!tma)
+    return malloc(size);
+  return tma->malloc(tma, size);
+}
+
+static __hwloc_inline void *
+hwloc_tma_calloc(struct hwloc_tma *tma,
+		 size_t size)
+{
+  char *zeroed = hwloc_tma_malloc(tma, size); /* may be NULL */
+  if (!zeroed)
+    return zeroed;
+  return memset(zeroed, 0, size); /* memset() returns its first argument */
+}
+
+static __hwloc_inline char *
+hwloc_tma_strdup(struct hwloc_tma *tma,
+		 const char *src)
+{
+  /* size of the copy, including the terminating \0 */
+  const size_t nbytes = strlen(src) + 1;
+  char *copy = hwloc_tma_malloc(tma, nbytes);
+  /* memcpy() returns its destination; a failed allocation is passed through */
+  return copy ? (char *) memcpy(copy, src, nbytes) : copy;
+}
+
+/* bitmap allocator to be used inside hwloc */
+extern hwloc_bitmap_t hwloc_bitmap_tma_dup(struct hwloc_tma *tma, hwloc_const_bitmap_t old);
+
+extern int hwloc__topology_dup(hwloc_topology_t *newp, hwloc_topology_t old, struct hwloc_tma *tma);
+extern void hwloc__topology_disadopt(hwloc_topology_t  topology);
+
 #endif /* HWLOC_PRIVATE_H */
diff --git a/ext/hwloc/include/private/solaris-chiptype.h b/ext/hwloc/include/private/solaris-chiptype.h
index 4af80d88f..4ad2130a0 100644
--- a/ext/hwloc/include/private/solaris-chiptype.h
+++ b/ext/hwloc/include/private/solaris-chiptype.h
@@ -1,6 +1,7 @@
 /*
  * Copyright © 2009-2010 Oracle and/or its affiliates.  All rights reserved.
  *
+ * Copyright © 2017 Inria.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -21,39 +22,22 @@
 #ifndef HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
 #define HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
 
-/* SPARC Chip Modes. */
-#define MODE_UNKNOWN            0
-#define MODE_SPITFIRE           1
-#define MODE_BLACKBIRD          2
-#define MODE_CHEETAH            3
-#define MODE_SPARC64_VI         4
-#define MODE_T1                 5
-#define MODE_T2                 6
-#define MODE_SPARC64_VII        7
-#define MODE_ROCK               8
-
-/* SPARC Chip Implementations. */
-#define IMPL_SPARC64_VI         0x6
-#define IMPL_SPARC64_VII        0x7
-#define IMPL_SPITFIRE           0x10
-#define IMPL_BLACKBIRD          0x11
-#define IMPL_SABRE              0x12
-#define IMPL_HUMMINGBIRD        0x13
-#define IMPL_CHEETAH            0x14
-#define IMPL_CHEETAHPLUS        0x15
-#define IMPL_JALAPENO           0x16
-#define IMPL_JAGUAR             0x18
-#define IMPL_PANTHER            0x19
-#define IMPL_NIAGARA            0x23
-#define IMPL_NIAGARA_2          0x24
-#define IMPL_ROCK               0x25
-
-/* Default Mfg, Cache, Speed settings */
-#define TI_MANUFACTURER         0x17
-#define TWO_MEG_CACHE           2097152
-#define SPITFIRE_SPEED          142943750
-
-char* hwloc_solaris_get_chip_type(void);
-char* hwloc_solaris_get_chip_model(void);
+struct hwloc_solaris_chip_info_s {
+  char *model;
+  char *type;
+  /* L1i, L1d, L2, L3 */
+#define HWLOC_SOLARIS_CHIP_INFO_L1I 0
+#define HWLOC_SOLARIS_CHIP_INFO_L1D 1
+#define HWLOC_SOLARIS_CHIP_INFO_L2I 2
+#define HWLOC_SOLARIS_CHIP_INFO_L2D 3
+#define HWLOC_SOLARIS_CHIP_INFO_L3  4
+  long cache_size[5]; /* cleared to -1 if we don't want that cache */
+  unsigned cache_linesize[5];
+  unsigned cache_associativity[5];
+  int l2_unified;
+};
+
+/* fills the structure with 0 on error */
+extern void hwloc_solaris_get_chip_info(struct hwloc_solaris_chip_info_s *info);
 
 #endif /* HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H */
diff --git a/ext/hwloc/include/private/xml.h b/ext/hwloc/include/private/xml.h
index 75c6c43ba..f59fca1ff 100644
--- a/ext/hwloc/include/private/xml.h
+++ b/ext/hwloc/include/private/xml.h
@@ -1,12 +1,12 @@
 /*
- * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2017 Inria.  All rights reserved.
  * See COPYING in top-level directory.
  */
 
 #ifndef PRIVATE_XML_H
 #define PRIVATE_XML_H 1
 
-#include <hwloc.h>
+#include "hwloc.h"
 
 #include <sys/types.h>
 
@@ -28,27 +28,32 @@ typedef struct hwloc__xml_import_state_s {
   char data[32];
 } * hwloc__xml_import_state_t;
 
+struct hwloc__xml_imported_v1distances_s {
+  unsigned long kind;
+  unsigned nbobjs;
+  float *floats;
+  struct hwloc__xml_imported_v1distances_s *prev, *next;
+};
+
 HWLOC_DECLSPEC int hwloc__xml_import_diff(hwloc__xml_import_state_t state, hwloc_topology_diff_t *firstdiffp);
 
 struct hwloc_xml_backend_data_s {
   /* xml backend parameters */
   int (*look_init)(struct hwloc_xml_backend_data_s *bdata, struct hwloc__xml_import_state_s *state);
-  void (*look_failed)(struct hwloc_xml_backend_data_s *bdata);
+  void (*look_done)(struct hwloc_xml_backend_data_s *bdata, int result);
   void (*backend_exit)(struct hwloc_xml_backend_data_s *bdata);
   int (*next_attr)(struct hwloc__xml_import_state_s * state, char **namep, char **valuep);
   int (*find_child)(struct hwloc__xml_import_state_s * state, struct hwloc__xml_import_state_s * childstate, char **tagp);
   int (*close_tag)(struct hwloc__xml_import_state_s * state); /* look for an explicit closing tag </name> */
   void (*close_child)(struct hwloc__xml_import_state_s * state);
-  int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length);
+  int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length); /* return 0 on empty content (and sets beginp to empty string), 1 on actual content, -1 on error or unexpected content length */
   void (*close_content)(struct hwloc__xml_import_state_s * state);
   char * msgprefix;
   void *data; /* libxml2 doc, or nolibxml buffer */
-  int nbnumanodes;
-  struct hwloc_xml_imported_distances_s {
-    hwloc_obj_t root;
-    struct hwloc_distances_s distances;
-    struct hwloc_xml_imported_distances_s *prev, *next;
-  } *first_distances, *last_distances;
+  unsigned version_major, version_minor;
+  unsigned nbnumanodes;
+  hwloc_obj_t first_numanode, last_numanode; /* temporary cousin-list for handling v1distances */
+  struct hwloc__xml_imported_v1distances_s *first_v1dist, *last_v1dist;
 };
 
 /**************
@@ -63,13 +68,17 @@ typedef struct hwloc__xml_export_state_s {
   void (*add_content)(struct hwloc__xml_export_state_s *state, const char *buffer, size_t length);
   void (*end_object)(struct hwloc__xml_export_state_s *state, const char *name);
 
+  struct hwloc__xml_export_data_s {
+    hwloc_obj_t v1_memory_group; /* if we need to insert intermediate group above memory children when exporting to v1 */
+  } *global;
+
   /* opaque data used to store backend-specific data.
    * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
    */
   char data[40];
 } * hwloc__xml_export_state_t;
 
-HWLOC_DECLSPEC void hwloc__xml_export_object (hwloc__xml_export_state_t state, struct hwloc_topology *topology, struct hwloc_obj *obj);
+HWLOC_DECLSPEC void hwloc__xml_export_topology(hwloc__xml_export_state_t parentstate, hwloc_topology_t topology, unsigned long flags);
 
 HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff);
 
@@ -79,8 +88,8 @@ HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate
 
 struct hwloc_xml_callbacks {
   int (*backend_init)(struct hwloc_xml_backend_data_s *bdata, const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
-  int (*export_file)(struct hwloc_topology *topology, const char *filename);
-  int (*export_buffer)(struct hwloc_topology *topology, char **xmlbuffer, int *buflen);
+  int (*export_file)(struct hwloc_topology *topology, struct hwloc__xml_export_data_s *edata, const char *filename, unsigned long flags);
+  int (*export_buffer)(struct hwloc_topology *topology, struct hwloc__xml_export_data_s *edata, char **xmlbuffer, int *buflen, unsigned long flags);
   void (*free_buffer)(void *xmlbuffer);
   int (*import_diff)(struct hwloc__xml_import_state_s *state, const char *xmlpath, const char *xmlbuffer, int xmlbuflen, hwloc_topology_diff_t *diff, char **refnamep);
   int (*export_diff_file)(union hwloc_topology_diff_u *diff, const char *refname, const char *filename);
diff --git a/ext/hwloc/include/static-components.h b/ext/hwloc/include/static-components.h
index 97a874978..6ee2e6826 100644
--- a/ext/hwloc/include/static-components.h
+++ b/ext/hwloc/include/static-components.h
@@ -11,7 +11,7 @@ static const struct hwloc_component * hwloc_static_components[] = {
   &hwloc_synthetic_component,
 //  &hwloc_xml_nolibxml_component,
   &hwloc_linux_component,
-  &hwloc_linuxpci_component,
+//  &hwloc_linuxpci_component,
 #if defined(__i386__) || defined(__x86_64)
   &hwloc_x86_component,
 #endif
diff --git a/ext/lua/Makefile b/ext/lua/Makefile
index f69ba2640..43a0fb9cf 100644
--- a/ext/lua/Makefile
+++ b/ext/lua/Makefile
@@ -14,12 +14,12 @@ DEFINES   = -DLUA_COMPAT_ALL -DLUA_COMPAT_5_2 -DLUA_USE_LINUX
 LIBS      = -lm -Wl,-E -ldl
 
 Q         ?= @
-ifeq ($(DEBUG),true)
+ifeq ($(strip $(DEBUG)),true)
 DEBUG_FLAGS = -g
 else
 DEBUG_FLAGS =
 endif
-ifeq ($(COMPILER),MIC)
+ifeq ($(strip $(COMPILER)),MIC)
 CFLAGS += -mmic
 LFLAGS += -mmic
 endif
diff --git a/filters/json b/filters/json
index bb17edef5..4ff319676 100755
--- a/filters/json
+++ b/filters/json
@@ -22,14 +22,29 @@ else:
 outfile = filename.replace("csv", "json").replace(".tmp", "")
 o_fp = open(outfile, "w")
 
+def tryint(s):
+    o = s  # fall back to the unmodified input when nothing below parses
+    try:
+        o = int(s)
+    except ValueError:  # not a plain integer; maybe a space-separated list of integers
+        t = s.replace(" ", "")
+        try:
+            t = int(t)  # the space-stripped text must itself parse as an int before the list form is tried
+            elems = [ int(e) for e in s.split(" ") if len(e) > 0 ]
+            o = elems
+        except ValueError: pass  # not an integer list either: keep the original string
+    return o
+
 def fill_table(lines, headlist):
     out = []
     for l in lines:
-        elems = [ e for e in re.split("\s*,\s*", l) if len(e) > 0 ]
-        
+        elems = [ tryint(e) for e in re.split("\s*,\s*", l) if len(e) > 0 ]
         t = {}
         for h, v in zip(headlist, elems):
-            t.update({h : v.replace(" STAT", "")})
+            if isinstance(v, str):  # tryint() may have returned an int or a list; only strings carry the " STAT" suffix
+                t.update({h : v.replace(" STAT", "")})
+            else:
+                t.update({h : v})
         #print("%%TABLE: %s " % str(t))
         out.append(t)
     return out
@@ -37,8 +52,7 @@ def fill_table(lines, headlist):
 def fill_struct(lines):
     struct = {}
     for l in lines:
-        elems = [ e for e in re.split("\s*,\s*", l) if len(e) > 0 ]
-        
+        elems = [ tryint(e) for e in re.split("\s*,\s*", l) if len(e) > 0 ]
         if len(elems) == 2:
             struct.update({ elems[0] : elems[1] })
         else:
@@ -46,6 +60,8 @@ def fill_struct(lines):
         #print("%%STRUCT: %s " % str(struct))
     return struct
 
+tables = {}
+groups = []
 
 if filetype == "perfctr" or filetype == "topology":
     inp = i_fp.read().strip().split("\n")
@@ -66,12 +82,15 @@ if filetype == "perfctr" or filetype == "topology":
             name = llist[1]
             lines = int(llist[2])
             s = fill_struct(inp[i+1:i+lines+1])
-            out.update({name: s})
+            info_struct = s;
+            info_name = name
+            tables.update({name: s})
             i += lines-1
         elif l.startswith("TABLE"):
             #sys.stderr.write(l+"\n")
             name = None
             gid = None
+            grp = None
             gname = None
             reg = None
             regname = None
@@ -101,17 +120,97 @@ if filetype == "perfctr" or filetype == "topology":
 
             t = fill_table(inp[i+2:i+lines+2], hlist)
             if grp and gname:
-                if grp not in out:
-                    out[grp] = {}
-                out[grp].update({gname : t})
+                if grp not in tables:
+                    tables[grp] = {}
+                    groups.append(grp)
+                tables[grp].update({gname : t})
             elif gid and gname:
-                if gid not in out:
-                    out[gid] = {}
-                out[gid].update({gname : t})
+                if gid not in tables:
+                    tables[gid] = {}
+                    groups.append(gid)  # record the key actually stored in tables; grp is falsy in this branch
+                tables[gid].update({gname : t})
             else:
-                out.update({tabname : t})
+                tables.update({tabname : t})
             i += lines-1
         i += 1
+
+    if filetype == "perfctr":
+        cpulist = []
+        gpulist = []
+
+        for elems in (tables[groups[0]].get("Raw", []) if groups else []):  # tolerate input without any group or "Raw" table
+            for k in elems:
+                m = re.match("Core (\d+)", k)
+                if m:
+                    cpulist.append(int(m.group(1)))
+                m = re.match("GPU (\d+)", k)
+                if m:
+                    gpulist.append(int(m.group(1)))
+
+        for g in groups:
+            counters = []
+            metrics = []
+            for tabkey in tables[g]:
+                if not "STAT" in tabkey:
+                    for line in tables[g][tabkey]:
+                        for k in line:
+                            if k == "Counter": counters.append(line[k])
+                            if k == "Metric": metrics.append(line[k])
+            group = {}
+            for tabkey in tables[g]:
+                if g not in group:
+                    group[g] = {}
+                group[g][tabkey] = {}
+                if "Raw" in tabkey and not "STAT" in tabkey:
+                    for c in counters:
+                        event = None
+                        values = []
+                        for line in tables[g][tabkey]:
+                            if line["Counter"] == c:
+                                event = line["Event"]
+                                values = []
+                                for k in line:
+                                    if k != "Counter" and k != "Event":
+                                        values.append(float(line[k]))
+                                group[g][tabkey][c] = {"Event" : event, "Values" : values}
+                                break
+                elif "Metric" in tabkey and not "STAT" in tabkey:
+                    for m in metrics:
+                        values = []
+                        for line in tables[g][tabkey]:
+                            if line["Metric"] == m:
+                                values = []
+                                for k in line:
+                                    if k != "Metric":
+                                        values.append(float(line[k]))
+                                group[g][tabkey][m] = {"Values" : values}
+                                break
+                else:
+                    new = {}
+                    for line in tables[g][tabkey]:
+                        key = None
+                        tmp = {}
+                        if "Counter" in line:
+                            key = line["Counter"]
+                        elif "Metric" in line:
+                            key = line["Metric"]
+                        for k in line:
+                            if line[k] != key:
+                                v = line[k]
+                                try:
+                                    v = float(v)
+                                except (TypeError, ValueError): pass  # keep non-numeric cells (strings, lists) unchanged
+                                tmp.update({k: v})
+                        new[key] = tmp
+                    group[g][tabkey] = new
+
+            out[g] = group
+        tables.setdefault("Info", {}).update({"CPU list" : sorted(set(cpulist))})  # sorted: stable JSON output; setdefault: no Info struct parsed
+        if len(gpulist) > 0:
+            tables["Info"].update({"GPU list" : sorted(set(gpulist))})
+        out["Info"] = tables["Info"]
+    else:
+        out = tables
     o_fp.write(json.dumps(out, indent=4, sort_keys=True)+"\n")
 else:
     sys.stderr.write("Filter failed! Unknown application type %s!" % filetype)
diff --git a/groups/CLX/CLOCK.txt b/groups/CLX/CLOCK.txt
index 5ff9f6961..b81bee6d1 100644
--- a/groups/CLX/CLOCK.txt
+++ b/groups/CLX/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/CLX/CYCLE_ACTIVITY.txt b/groups/CLX/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/CLX/CYCLE_ACTIVITY.txt
+++ b/groups/CLX/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/CLX/CYCLE_STALLS.txt b/groups/CLX/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/CLX/CYCLE_STALLS.txt
+++ b/groups/CLX/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/CLX/DIVIDE.txt b/groups/CLX/DIVIDE.txt
index 4f5a0eabf..2c6222dc3 100644
--- a/groups/CLX/DIVIDE.txt
+++ b/groups/CLX/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_COUNT
+Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
+--
 This performance group measures the average latency of divide operations
diff --git a/groups/CLX/ENERGY.txt b/groups/CLX/ENERGY.txt
index 28f02567e..fe7829fbe 100644
--- a/groups/CLX/ENERGY.txt
+++ b/groups/CLX/ENERGY.txt
@@ -25,7 +25,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/CLX/FLOPS_AVX.txt b/groups/CLX/FLOPS_AVX.txt
index d3ea8004b..d65b522be 100644
--- a/groups/CLX/FLOPS_AVX.txt
+++ b/groups/CLX/FLOPS_AVX.txt
@@ -14,13 +14,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0+PMC2*16.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0+PMC3*8.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*16)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
 -
 Packed 32b AVX FLOPs rates.
 
diff --git a/groups/CLX/FLOPS_DP.txt b/groups/CLX/FLOPS_DP.txt
index 11091902a..7d6af795a 100644
--- a/groups/CLX/FLOPS_DP.txt
+++ b/groups/CLX/FLOPS_DP.txt
@@ -14,21 +14,20 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
-AVX512 DP MFLOP/s  1.0E-06*(PMC3*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
+AVX DP [MFLOP/s]  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
+AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
 
 LONG
-Formula:
-DP MFLOP/s =
-1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX512 DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates.
diff --git a/groups/CLX/FLOPS_SP.txt b/groups/CLX/FLOPS_SP.txt
index 2dc3428c6..39fb08d63 100644
--- a/groups/CLX/FLOPS_SP.txt
+++ b/groups/CLX/FLOPS_SP.txt
@@ -14,22 +14,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
-AVX512 SP MFLOP/s  1.0E-06*(PMC3*16.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
+AVX SP [MFLOP/s]  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
+AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
 
 LONG
-Formula:
-SP MFLOP/s =
-1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX512 SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/CLX/MEM_DP.txt b/groups/CLX/MEM_DP.txt
index 518fad723..68e868453 100644
--- a/groups/CLX/MEM_DP.txt
+++ b/groups/CLX/MEM_DP.txt
@@ -32,10 +32,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
+AVX DP [MFLOP/s]  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
@@ -45,13 +45,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
diff --git a/groups/CLX/MEM_SP.txt b/groups/CLX/MEM_SP.txt
index 79dabcc99..73452f222 100644
--- a/groups/CLX/MEM_SP.txt
+++ b/groups/CLX/MEM_SP.txt
@@ -32,10 +32,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
+AVX SP [MFLOP/s]  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
@@ -45,13 +45,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
diff --git a/groups/CLX/TMA.txt b/groups/CLX/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/CLX/TMA.txt
+++ b/groups/CLX/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/arm8_tx2/FLOPS_DP.txt b/groups/arm8_tx2/FLOPS_DP.txt
index 2f96ca3de..5b477def2 100644
--- a/groups/arm8_tx2/FLOPS_DP.txt
+++ b/groups/arm8_tx2/FLOPS_DP.txt
@@ -10,18 +10,18 @@ METRICS
 Runtime (RDTSC) [s] time
 Clock [MHz] 1.E-06*PMC1/time
 CPI  PMC1/PMC0
-DP MFLOP/s  1.0E-06*(PMC3*2.0+PMC2)/time
-NEON DP MFLOP/s  1.0E-06*(PMC3*2.0)/time
-Packed MUOPS/s   1.0E-06*(PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC2/time
+DP [MFLOP/s]  1.0E-06*(PMC3*2.0+PMC2)/time
+NEON DP [MFLOP/s]  1.0E-06*(PMC3*2.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC2/time
 Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
-NEON DP MFLOP/s = 1.0E-06*(ASE_SPEC*4)/runtime
-Packed MUOPS/s = 1.0E-06*(ASE_SPEC)/runtime
-Scalar MUOPS/s = 1.0E-06*VFP_SPEC/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
+NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime
+Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
+Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
 Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
 -
 NEON scalar and packed double precision FLOP rates.
diff --git a/groups/arm8_tx2/FLOPS_SP.txt b/groups/arm8_tx2/FLOPS_SP.txt
index 403321035..9857308bf 100644
--- a/groups/arm8_tx2/FLOPS_SP.txt
+++ b/groups/arm8_tx2/FLOPS_SP.txt
@@ -10,18 +10,18 @@ METRICS
 Runtime (RDTSC) [s] time
 Clock [MHz] 1.E-06*PMC1/time
 CPI  PMC1/PMC0
-SP MFLOP/s  1.0E-06*(PMC3*2.0+PMC2)/time
-NEON SP MFLOP/s  1.0E-06*(PMC3*2.0)/time
-Packed MUOPS/s   1.0E-06*(PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC2/time
+SP [MFLOP/s]  1.0E-06*(PMC3*2.0+PMC2)/time
+NEON SP [MFLOP/s]  1.0E-06*(PMC3*2.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC2/time
 Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
-NEON SP MFLOP/s = 1.0E-06*(ASE_SPEC*4)/runtime
-Packed MUOPS/s = 1.0E-06*(ASE_SPEC)/runtime
-Scalar MUOPS/s = 1.0E-06*VFP_SPEC/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
+NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime
+Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
+Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
 Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
 -
 NEON scalar and packed single precision FLOP rates.
diff --git a/groups/arm8_tx2/MEM.txt b/groups/arm8_tx2/MEM.txt
index 7b1e80d3a..06bc6971d 100644
--- a/groups/arm8_tx2/MEM.txt
+++ b/groups/arm8_tx2/MEM.txt
@@ -3,28 +3,30 @@ SHORT Main memory bandwidth in MBytes/s
 EVENTSET
 PMC0  INST_RETIRED
 PMC1  CPU_CYCLES
-PMC2  MEM_ACCESS_LD
-PMC3  MEM_ACCESS_ST
-
+MBOX0C0  MEMORY_READS
+MBOX0C1  MEMORY_WRITES
+MBOX1C0  MEMORY_READS
+MBOX1C1  MEMORY_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Clock [MHz] 1.E-06*PMC1/time
 CPI  PMC1/PMC0
-Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
-Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0
-Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
-Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0
-Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0
 
 LONG
 Formulas:
-Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_LD)*64.0/runtime
-Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_LD)*64.0
-Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_ST)*64.0/runtime
-Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_ST)*64.0
-Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_LD+MEM_ACCESS_ST)*64.0/runtime
-Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_LD+MEM_ACCESS_ST)*64.0
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0
 -
-Profiling group to measure memory bandwidth.
+Profiling group to measure memory bandwidth. It uses the performance monitoring
+hardware of the memory controllers.
diff --git a/groups/arm8_tx2/SPEC.txt b/groups/arm8_tx2/SPEC.txt
index ff68cda5d..7561d3a4b 100644
--- a/groups/arm8_tx2/SPEC.txt
+++ b/groups/arm8_tx2/SPEC.txt
@@ -29,6 +29,7 @@ Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0
 
 
 LONG
+Formulas:
 Load ops spec. ratio = LD_SPEC / INST_SPEC
 Store ops spec. ratio = ST_SPEC / INST_SPEC
 Integer data ops spec. ratio = DP_SPEC / INST_SPEC
diff --git a/groups/atom/FLOPS_DP.txt b/groups/atom/FLOPS_DP.txt
index 891a15e01..53b2d02cb 100644
--- a/groups/atom/FLOPS_DP.txt
+++ b/groups/atom/FLOPS_DP.txt
@@ -10,16 +10,16 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s    1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]    1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime
-Packed MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
-Scalar MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
+DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime
+Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
+Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
 --
-Double Precision MFLOP/s Double Precision MFLOP/s
+Double precision floating-point rates in MFLOP/s
 
diff --git a/groups/atom/FLOPS_SP.txt b/groups/atom/FLOPS_SP.txt
index 3ebce6c23..0046d5bfd 100644
--- a/groups/atom/FLOPS_SP.txt
+++ b/groups/atom/FLOPS_SP.txt
@@ -10,15 +10,15 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*(PMC0)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime
-Packed MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
-Scalar MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
+SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime
+Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
+Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
 --
 Single Precision MFLOP/s Double Precision MFLOP/s
 
diff --git a/groups/atom/FLOPS_X87.txt b/groups/atom/FLOPS_X87.txt
index 204a61e13..58c5d4222 100644
--- a/groups/atom/FLOPS_X87.txt
+++ b/groups/atom/FLOPS_X87.txt
@@ -9,10 +9,11 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
-X87 MFLOP/s = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime
+Formulas:
+X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime
 --
 The MFLOP/s made with X87 instructions
 
diff --git a/groups/broadwell/CLOCK.txt b/groups/broadwell/CLOCK.txt
index 5ff9f6961..b81bee6d1 100644
--- a/groups/broadwell/CLOCK.txt
+++ b/groups/broadwell/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/broadwell/CYCLE_ACTIVITY.txt b/groups/broadwell/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/broadwell/CYCLE_ACTIVITY.txt
+++ b/groups/broadwell/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/broadwell/CYCLE_STALLS.txt b/groups/broadwell/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/broadwell/CYCLE_STALLS.txt
+++ b/groups/broadwell/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/broadwell/DIVIDE.txt b/groups/broadwell/DIVIDE.txt
index 303bbdade..077dfd3f0 100644
--- a/groups/broadwell/DIVIDE.txt
+++ b/groups/broadwell/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/broadwell/ENERGY.txt b/groups/broadwell/ENERGY.txt
index ae1756fdc..09eaeb140 100644
--- a/groups/broadwell/ENERGY.txt
+++ b/groups/broadwell/ENERGY.txt
@@ -28,7 +28,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/broadwell/FALSE_SHARE.txt b/groups/broadwell/FALSE_SHARE.txt
index bb26898fc..a297654b4 100644
--- a/groups/broadwell/FALSE_SHARE.txt
+++ b/groups/broadwell/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
 -
diff --git a/groups/broadwell/FLOPS_AVX.txt b/groups/broadwell/FLOPS_AVX.txt
index eb047fa5f..785460812 100644
--- a/groups/broadwell/FLOPS_AVX.txt
+++ b/groups/broadwell/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
 -
 FLOP rates of 256 bit packed floating-point instructions
 
diff --git a/groups/broadwell/FLOPS_DP.txt b/groups/broadwell/FLOPS_DP.txt
index 2419b8411..348ec7683 100644
--- a/groups/broadwell/FLOPS_DP.txt
+++ b/groups/broadwell/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)
 -
 AVX/SSE scalar and packed double precision FLOP rates.
diff --git a/groups/broadwell/FLOPS_SP.txt b/groups/broadwell/FLOPS_SP.txt
index b6c46095b..1d7fd7c44 100644
--- a/groups/broadwell/FLOPS_SP.txt
+++ b/groups/broadwell/FLOPS_SP.txt
@@ -13,19 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
 -
 AVX/SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/broadwell/PORT_USAGE.txt b/groups/broadwell/PORT_USAGE.txt
index 459f7f66d..298df1d23 100644
--- a/groups/broadwell/PORT_USAGE.txt
+++ b/groups/broadwell/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port1 usage ratio UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port2 usage ratio UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port3 usage ratio UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port4 usage ratio UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port5 usage ratio UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port6 usage ratio UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port7 usage ratio UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/broadwell/TMA.txt b/groups/broadwell/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/broadwell/TMA.txt
+++ b/groups/broadwell/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/broadwell/UOPS.txt b/groups/broadwell/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/broadwell/UOPS.txt
+++ b/groups/broadwell/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/broadwellD/CLOCK.txt b/groups/broadwellD/CLOCK.txt
index 5ff9f6961..b81bee6d1 100644
--- a/groups/broadwellD/CLOCK.txt
+++ b/groups/broadwellD/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/broadwellD/CYCLE_ACTIVITY.txt b/groups/broadwellD/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/broadwellD/CYCLE_ACTIVITY.txt
+++ b/groups/broadwellD/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/broadwellD/CYCLE_STALLS.txt b/groups/broadwellD/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/broadwellD/CYCLE_STALLS.txt
+++ b/groups/broadwellD/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/broadwellD/DIVIDE.txt b/groups/broadwellD/DIVIDE.txt
index 303bbdade..077dfd3f0 100644
--- a/groups/broadwellD/DIVIDE.txt
+++ b/groups/broadwellD/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/broadwellD/ENERGY.txt b/groups/broadwellD/ENERGY.txt
index ae1756fdc..09eaeb140 100644
--- a/groups/broadwellD/ENERGY.txt
+++ b/groups/broadwellD/ENERGY.txt
@@ -28,7 +28,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/broadwellD/FALSE_SHARE.txt b/groups/broadwellD/FALSE_SHARE.txt
index dd2a44b7f..68107bf4b 100644
--- a/groups/broadwellD/FALSE_SHARE.txt
+++ b/groups/broadwellD/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
 -
diff --git a/groups/broadwellD/FLOPS_AVX.txt b/groups/broadwellD/FLOPS_AVX.txt
index eb047fa5f..785460812 100644
--- a/groups/broadwellD/FLOPS_AVX.txt
+++ b/groups/broadwellD/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
 -
 FLOP rates of 256 bit packed floating-point instructions
 
diff --git a/groups/broadwellD/FLOPS_DP.txt b/groups/broadwellD/FLOPS_DP.txt
index 2419b8411..348ec7683 100644
--- a/groups/broadwellD/FLOPS_DP.txt
+++ b/groups/broadwellD/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)
 -
 AVX/SSE scalar and packed double precision FLOP rates.
diff --git a/groups/broadwellD/FLOPS_SP.txt b/groups/broadwellD/FLOPS_SP.txt
index b6c46095b..1d7fd7c44 100644
--- a/groups/broadwellD/FLOPS_SP.txt
+++ b/groups/broadwellD/FLOPS_SP.txt
@@ -13,19 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
 -
 AVX/SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/broadwellD/MEM_DP.txt b/groups/broadwellD/MEM_DP.txt
index 33dac2b5e..71ce2aea7 100644
--- a/groups/broadwellD/MEM_DP.txt
+++ b/groups/broadwellD/MEM_DP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
diff --git a/groups/broadwellD/MEM_SP.txt b/groups/broadwellD/MEM_SP.txt
index 1f8614937..6d67ea777 100644
--- a/groups/broadwellD/MEM_SP.txt
+++ b/groups/broadwellD/MEM_SP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
diff --git a/groups/broadwellD/PORT_USAGE.txt b/groups/broadwellD/PORT_USAGE.txt
index 459f7f66d..298df1d23 100644
--- a/groups/broadwellD/PORT_USAGE.txt
+++ b/groups/broadwellD/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port1 usage ratio UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port2 usage ratio UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port3 usage ratio UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port4 usage ratio UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port5 usage ratio UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port6 usage ratio UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port7 usage ratio UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/broadwellD/TMA.txt b/groups/broadwellD/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/broadwellD/TMA.txt
+++ b/groups/broadwellD/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/broadwellD/UOPS.txt b/groups/broadwellD/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/broadwellD/UOPS.txt
+++ b/groups/broadwellD/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/broadwellEP/CLOCK.txt b/groups/broadwellEP/CLOCK.txt
index 5ff9f6961..b81bee6d1 100644
--- a/groups/broadwellEP/CLOCK.txt
+++ b/groups/broadwellEP/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/broadwellEP/CYCLE_ACTIVITY.txt b/groups/broadwellEP/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/broadwellEP/CYCLE_ACTIVITY.txt
+++ b/groups/broadwellEP/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/broadwellEP/CYCLE_STALLS.txt b/groups/broadwellEP/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/broadwellEP/CYCLE_STALLS.txt
+++ b/groups/broadwellEP/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/broadwellEP/DIVIDE.txt b/groups/broadwellEP/DIVIDE.txt
index 303bbdade..077dfd3f0 100644
--- a/groups/broadwellEP/DIVIDE.txt
+++ b/groups/broadwellEP/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/broadwellEP/ENERGY.txt b/groups/broadwellEP/ENERGY.txt
index 28f02567e..fe7829fbe 100644
--- a/groups/broadwellEP/ENERGY.txt
+++ b/groups/broadwellEP/ENERGY.txt
@@ -25,7 +25,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/broadwellEP/FALSE_SHARE.txt b/groups/broadwellEP/FALSE_SHARE.txt
index 1a2fd7028..602b606a1 100644
--- a/groups/broadwellEP/FALSE_SHARE.txt
+++ b/groups/broadwellEP/FALSE_SHARE.txt
@@ -19,7 +19,7 @@ Remote LLC false sharing [MByte] 1.E-06*PMC1*64
 Remote LLC false sharing rate PMC1/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL
 Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
diff --git a/groups/broadwellEP/FLOPS_AVX.txt b/groups/broadwellEP/FLOPS_AVX.txt
index eb047fa5f..785460812 100644
--- a/groups/broadwellEP/FLOPS_AVX.txt
+++ b/groups/broadwellEP/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
 -
 FLOP rates of 256 bit packed floating-point instructions
 
diff --git a/groups/broadwellEP/FLOPS_DP.txt b/groups/broadwellEP/FLOPS_DP.txt
index 2419b8411..348ec7683 100644
--- a/groups/broadwellEP/FLOPS_DP.txt
+++ b/groups/broadwellEP/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)
 -
 AVX/SSE scalar and packed double precision FLOP rates.
diff --git a/groups/broadwellEP/FLOPS_SP.txt b/groups/broadwellEP/FLOPS_SP.txt
index b6c46095b..1d7fd7c44 100644
--- a/groups/broadwellEP/FLOPS_SP.txt
+++ b/groups/broadwellEP/FLOPS_SP.txt
@@ -13,19 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
 -
 AVX/SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/broadwellEP/MEM_DP.txt b/groups/broadwellEP/MEM_DP.txt
index cdecadf0c..6078d57cf 100644
--- a/groups/broadwellEP/MEM_DP.txt
+++ b/groups/broadwellEP/MEM_DP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
diff --git a/groups/broadwellEP/MEM_SP.txt b/groups/broadwellEP/MEM_SP.txt
index c887bf64d..d18d2ab79 100644
--- a/groups/broadwellEP/MEM_SP.txt
+++ b/groups/broadwellEP/MEM_SP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
diff --git a/groups/broadwellEP/NUMA.txt b/groups/broadwellEP/NUMA.txt
index 8fdd0f168..5b30e2595 100644
--- a/groups/broadwellEP/NUMA.txt
+++ b/groups/broadwellEP/NUMA.txt
@@ -27,7 +27,7 @@ Total data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C
 
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 Local bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64)/time
 Local data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64
diff --git a/groups/broadwellEP/PORT_USAGE.txt b/groups/broadwellEP/PORT_USAGE.txt
index 459f7f66d..298df1d23 100644
--- a/groups/broadwellEP/PORT_USAGE.txt
+++ b/groups/broadwellEP/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port1 usage ratio UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port2 usage ratio UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port3 usage ratio UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port4 usage ratio UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port5 usage ratio UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port6 usage ratio UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port7 usage ratio UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/broadwellEP/QPI.txt b/groups/broadwellEP/QPI.txt
index 20d7cdf05..85947062d 100644
--- a/groups/broadwellEP/QPI.txt
+++ b/groups/broadwellEP/QPI.txt
@@ -31,7 +31,7 @@ QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C
 QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
 
 LONG
-Formula:
+Formulas:
 QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
 QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
 QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
diff --git a/groups/broadwellEP/TLB_DATA.txt b/groups/broadwellEP/TLB_DATA.txt
index 89841d504..54f5e05bc 100644
--- a/groups/broadwellEP/TLB_DATA.txt
+++ b/groups/broadwellEP/TLB_DATA.txt
@@ -23,12 +23,12 @@ L1 DTLB store miss duration PMC3
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses     = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate  = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration = DTLB_LOAD_MISSES_WALK_DURATION
+L1 DTLB store misses     = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate  = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration = DTLB_STORE_MISSES_WALK_DURATION
 -
 The DTLB load and store miss rates gives a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/broadwellEP/TLB_INSTR.txt b/groups/broadwellEP/TLB_INSTR.txt
index b19545227..647748f4c 100644
--- a/groups/broadwellEP/TLB_INSTR.txt
+++ b/groups/broadwellEP/TLB_INSTR.txt
@@ -19,9 +19,9 @@ L1 ITLB miss duration PMC1
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses     = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate  = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration = ITLB_MISSES_WALK_DURATION
 -
 The ITLB miss rates gives a measure how often a TLB miss occurred
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/broadwellEP/TMA.txt b/groups/broadwellEP/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/broadwellEP/TMA.txt
+++ b/groups/broadwellEP/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/broadwellEP/UOPS.txt b/groups/broadwellEP/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/broadwellEP/UOPS.txt
+++ b/groups/broadwellEP/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/core2/CLOCK.txt b/groups/core2/CLOCK.txt
index 4a5986f82..871c4f958 100644
--- a/groups/core2/CLOCK.txt
+++ b/groups/core2/CLOCK.txt
@@ -12,7 +12,7 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE / INSTR_RETIRED_ANY
 -
 Most basic performance group measuring the the clock frequency of the machine.
diff --git a/groups/core2/DIVIDE.txt b/groups/core2/DIVIDE.txt
index 85423f22a..0753b4ee6 100644
--- a/groups/core2/DIVIDE.txt
+++ b/groups/core2/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC1
 Avg. divide unit usage duration PMC0/PMC1
 
 LONG
+Formulas:
+Number of divide ops = DIV
+Avg. divide unit usage duration = CYCLES_DIV_BUSY/DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/core2/FLOPS_DP.txt b/groups/core2/FLOPS_DP.txt
index 2283b927f..e1698ff1b 100644
--- a/groups/core2/FLOPS_DP.txt
+++ b/groups/core2/FLOPS_DP.txt
@@ -11,16 +11,16 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s    1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s 1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]    1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s] 1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*PMC0/PMC1
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
-Packed MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
-Scalar MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
+DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time
+Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
+Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
 -
 Profiling group to measure double SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
diff --git a/groups/core2/FLOPS_SP.txt b/groups/core2/FLOPS_SP.txt
index ec128a60e..a2c842cd0 100644
--- a/groups/core2/FLOPS_SP.txt
+++ b/groups/core2/FLOPS_SP.txt
@@ -11,17 +11,17 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*PMC0/PMC1
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
-Packed MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
-Scalar MUOPS/s = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
+SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time
+Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
+Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
 -
 Profiling group to measure single precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs.
 On the number of SIMD_COMP_INST_RETIRED_PACKED_SINGLE you can see how well your code was vectorized.
diff --git a/groups/core2/FLOPS_X87.txt b/groups/core2/FLOPS_X87.txt
index d44a2fa51..46309e453 100644
--- a/groups/core2/FLOPS_X87.txt
+++ b/groups/core2/FLOPS_X87.txt
@@ -10,11 +10,11 @@ METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
 Formulas:
-X87 MFLOP/s = 1.0E-06*X87_OPS_RETIRED_ANY/time
+X87 [MFLOP/s] = 1.0E-06*X87_OPS_RETIRED_ANY/time
 -
 Profiling group to measure X87 FLOPs. Note that also non computational operations
 are measured by this event.
diff --git a/groups/core2/UOPS.txt b/groups/core2/UOPS.txt
index 81674162c..5d816d87b 100644
--- a/groups/core2/UOPS.txt
+++ b/groups/core2/UOPS.txt
@@ -18,5 +18,9 @@ Executed UOPs PMC0
 Retired UOPs PMC1
 
 LONG
+Formulas:
+Executed UOPs = RS_UOPS_DISPATCHED_ALL
+Retired UOPs = UOPS_RETIRED_ANY
+-
 Performance group measures the executed and retired micro ops. The difference
 between executed and retired uOPs are the speculatively executed uOPs.
diff --git a/groups/goldmont/CLOCK.txt b/groups/goldmont/CLOCK.txt
index 088a776b1..b2174c82b 100644
--- a/groups/goldmont/CLOCK.txt
+++ b/groups/goldmont/CLOCK.txt
@@ -15,7 +15,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
diff --git a/groups/goldmont/DIVIDE.txt b/groups/goldmont/DIVIDE.txt
index 0a2aaec1e..9fc67025f 100644
--- a/groups/goldmont/DIVIDE.txt
+++ b/groups/goldmont/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC1
 Avg. divide unit usage duration PMC0/PMC1
 
 LONG
+Formulas:
+Number of divide ops = CYCLES_DIV_BUSY_ALL_COUNT
+Avg. divide unit usage duration = CYCLES_DIV_BUSY_ALL/CYCLES_DIV_BUSY_ALL_COUNT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/goldmont/ENERGY.txt b/groups/goldmont/ENERGY.txt
index b94dd6a84..77705344f 100644
--- a/groups/goldmont/ENERGY.txt
+++ b/groups/goldmont/ENERGY.txt
@@ -23,7 +23,7 @@ Energy DRAM [J]  PWR1
 Power DRAM [W] PWR1/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/haswell/CLOCK.txt b/groups/haswell/CLOCK.txt
index ef2dda180..8055d5b0d 100644
--- a/groups/haswell/CLOCK.txt
+++ b/groups/haswell/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/haswell/CYCLE_ACTIVITY.txt b/groups/haswell/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/haswell/CYCLE_ACTIVITY.txt
+++ b/groups/haswell/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/haswell/CYCLE_STALLS.txt b/groups/haswell/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/haswell/CYCLE_STALLS.txt
+++ b/groups/haswell/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/haswell/DIVIDE.txt b/groups/haswell/DIVIDE.txt
index dbe1e0380..c9690cfe7 100644
--- a/groups/haswell/DIVIDE.txt
+++ b/groups/haswell/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_UOPS
+Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/haswell/ENERGY.txt b/groups/haswell/ENERGY.txt
index e8bed3afd..59242db19 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/haswell/ENERGY.txt
@@ -28,7 +28,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/haswell/FALSE_SHARE.txt b/groups/haswell/FALSE_SHARE.txt
index 43ea23b77..db438a308 100644
--- a/groups/haswell/FALSE_SHARE.txt
+++ b/groups/haswell/FALSE_SHARE.txt
@@ -16,8 +16,7 @@ Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64
 Local LLC hit with false sharing rate PMC0/PMC2
 
 LONG
-Formula:
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 -
diff --git a/groups/haswell/FLOPS_AVX.txt b/groups/haswell/FLOPS_AVX.txt
index 9efdd1d7b..15aacb8d7 100644
--- a/groups/haswell/FLOPS_AVX.txt
+++ b/groups/haswell/FLOPS_AVX.txt
@@ -11,13 +11,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC0*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC0*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
 -
 Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
 May count non-AVX instructions that employ 256-bit operations, including (but
diff --git a/groups/haswell/PORT_USAGE.txt b/groups/haswell/PORT_USAGE.txt
index 898cdd159..eb74ffe70 100644
--- a/groups/haswell/PORT_USAGE.txt
+++ b/groups/haswell/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port1 usage ratio UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port2 usage ratio UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port3 usage ratio UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port4 usage ratio UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port5 usage ratio UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port6 usage ratio UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port7 usage ratio UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/haswell/TMA.txt b/groups/haswell/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/haswell/TMA.txt
+++ b/groups/haswell/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/haswell/UOPS.txt b/groups/haswell/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/haswell/UOPS.txt
+++ b/groups/haswell/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/haswellEP/CACHES.txt b/groups/haswellEP/CACHES.txt
index 7721ad5bb..295a1397a 100644
--- a/groups/haswellEP/CACHES.txt
+++ b/groups/haswellEP/CACHES.txt
@@ -111,7 +111,7 @@ L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(
 L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
-Memory write bandwidth [MBytes/s] 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
 Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
 Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
diff --git a/groups/haswellEP/CLOCK.txt b/groups/haswellEP/CLOCK.txt
index ef2dda180..8055d5b0d 100644
--- a/groups/haswellEP/CLOCK.txt
+++ b/groups/haswellEP/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/haswellEP/CYCLE_ACTIVITY.txt b/groups/haswellEP/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/haswellEP/CYCLE_ACTIVITY.txt
+++ b/groups/haswellEP/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/haswellEP/CYCLE_STALLS.txt b/groups/haswellEP/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/haswellEP/CYCLE_STALLS.txt
+++ b/groups/haswellEP/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/haswellEP/DIVIDE.txt b/groups/haswellEP/DIVIDE.txt
index dbe1e0380..c9690cfe7 100644
--- a/groups/haswellEP/DIVIDE.txt
+++ b/groups/haswellEP/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_UOPS
+Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/haswellEP/ENERGY.txt b/groups/haswellEP/ENERGY.txt
index 6c26b30f9..ee0af1b5c 100644
--- a/groups/haswellEP/ENERGY.txt
+++ b/groups/haswellEP/ENERGY.txt
@@ -25,7 +25,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/haswellEP/FALSE_SHARE.txt b/groups/haswellEP/FALSE_SHARE.txt
index ce1a8bb93..872dbc1cf 100644
--- a/groups/haswellEP/FALSE_SHARE.txt
+++ b/groups/haswellEP/FALSE_SHARE.txt
@@ -19,7 +19,7 @@ Remote LLC false sharing [MByte] 1.E-06*PMC1*64
 Remote LLC false sharing rate PMC1/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64
diff --git a/groups/haswellEP/FLOPS_AVX.txt b/groups/haswellEP/FLOPS_AVX.txt
index 9efdd1d7b..15aacb8d7 100644
--- a/groups/haswellEP/FLOPS_AVX.txt
+++ b/groups/haswellEP/FLOPS_AVX.txt
@@ -11,13 +11,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC0*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC0*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime
 -
 Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions.
 May count non-AVX instructions that employ 256-bit operations, including (but
diff --git a/groups/haswellEP/NUMA.txt b/groups/haswellEP/NUMA.txt
index 0c1b8fb04..41fbe6288 100644
--- a/groups/haswellEP/NUMA.txt
+++ b/groups/haswellEP/NUMA.txt
@@ -20,7 +20,7 @@ Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
 Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
 Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
diff --git a/groups/haswellEP/PORT_USAGE.txt b/groups/haswellEP/PORT_USAGE.txt
index 898cdd159..eb74ffe70 100644
--- a/groups/haswellEP/PORT_USAGE.txt
+++ b/groups/haswellEP/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port1 usage ratio UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port2 usage ratio UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port3 usage ratio UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port4 usage ratio UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port5 usage ratio UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port6 usage ratio UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
-Port7 usage ratio UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*)
+Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/haswellEP/QPI.txt b/groups/haswellEP/QPI.txt
index 4ad0cf8d8..dcdda85c3 100644
--- a/groups/haswellEP/QPI.txt
+++ b/groups/haswellEP/QPI.txt
@@ -31,7 +31,7 @@ QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C
 QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time
 
 LONG
-Formula:
+Formulas:
 QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
 QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
 QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
diff --git a/groups/haswellEP/SBOX.txt b/groups/haswellEP/SBOX.txt
index 246deea2f..24f86b6ee 100644
--- a/groups/haswellEP/SBOX.txt
+++ b/groups/haswellEP/SBOX.txt
@@ -18,7 +18,7 @@ Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/ti
 Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32
 
 LONG
-Formula:
+Formulas:
 Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time
 Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32)
 --
diff --git a/groups/haswellEP/TMA.txt b/groups/haswellEP/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/haswellEP/TMA.txt
+++ b/groups/haswellEP/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/haswellEP/UOPS.txt b/groups/haswellEP/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/haswellEP/UOPS.txt
+++ b/groups/haswellEP/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/interlagos/FLOPS_DP.txt b/groups/interlagos/FLOPS_DP.txt
index 27e58c33a..7af248c23 100644
--- a/groups/interlagos/FLOPS_DP.txt
+++ b/groups/interlagos/FLOPS_DP.txt
@@ -9,14 +9,14 @@ PMC3  RETIRED_FLOPS_DOUBLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-DP MFLOP/s    1.0E-06*(PMC3)/time
+DP [MFLOP/s]    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
 -
 Profiling group to measure double precisision FLOP rate.
 
diff --git a/groups/interlagos/FLOPS_SP.txt b/groups/interlagos/FLOPS_SP.txt
index 7db569f08..14af2c231 100644
--- a/groups/interlagos/FLOPS_SP.txt
+++ b/groups/interlagos/FLOPS_SP.txt
@@ -9,14 +9,14 @@ PMC3  RETIRED_FLOPS_SINGLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-SP MFLOP/s    1.0E-06*(PMC3)/time
+SP [MFLOP/s]    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
 -
 Profiling group to measure single precision FLOP rate.
 
diff --git a/groups/interlagos/L2.txt b/groups/interlagos/L2.txt
index 5bf18430f..4d90ef81b 100644
--- a/groups/interlagos/L2.txt
+++ b/groups/interlagos/L2.txt
@@ -14,10 +14,10 @@ Cache refill bandwidth System  [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64
-Cache refill bandwidth system/L2 [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-Cache refill bandwidth system [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
+L2 bandwidth [MBytes/s]   = 1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time
+L2 data volume [GBytes]   = 1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64
+Cache refill bandwidth system/L2 [MBytes/s]   = 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
+Cache refill bandwidth system [MBytes/s]   = 1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cache line loaded from L2 to L1 and the
diff --git a/groups/interlagos/LINKS.txt b/groups/interlagos/LINKS.txt
index 4b8ac22dc..dbf3cd016 100644
--- a/groups/interlagos/LINKS.txt
+++ b/groups/interlagos/LINKS.txt
@@ -15,10 +15,10 @@ Link bandwidth L3 [MBytes/s]  1.0E-06*UPMC3*4.0/time
 
 LONG
 Formulas:
-Link bandwidth L0 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L0_USE*4.0/time
-Link bandwidth L1 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time
-Link bandwidth L2 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time
-Link bandwidth L3 [MBytes/s]  1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time
+Link bandwidth L0 [MBytes/s]  = 1.0E-06*UNC_LINK_TRANSMIT_BW_L0_USE*4.0/time
+Link bandwidth L1 [MBytes/s]  = 1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time
+Link bandwidth L2 [MBytes/s]  = 1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time
+Link bandwidth L3 [MBytes/s]  = 1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time
 -
 Profiling group to measure the HyperTransport link bandwidth for the four links
 of a local node. This indicates the# data flow between different ccNUMA nodes.
diff --git a/groups/interlagos/NUMA.txt b/groups/interlagos/NUMA.txt
index ed13dbe30..79f3618c5 100644
--- a/groups/interlagos/NUMA.txt
+++ b/groups/interlagos/NUMA.txt
@@ -15,10 +15,10 @@ DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
 
 LONG
 Formulas:
-DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+DRAM read/write local to 0 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
 -
 Profiling group to measure the traffic from local CPU to the different
 DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
diff --git a/groups/interlagos/NUMA_0_3.txt b/groups/interlagos/NUMA_0_3.txt
index ed13dbe30..79f3618c5 100644
--- a/groups/interlagos/NUMA_0_3.txt
+++ b/groups/interlagos/NUMA_0_3.txt
@@ -15,10 +15,10 @@ DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
 
 LONG
 Formulas:
-DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+DRAM read/write local to 0 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
 -
 Profiling group to measure the traffic from local CPU to the different
 DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
diff --git a/groups/interlagos/NUMA_4_7.txt b/groups/interlagos/NUMA_4_7.txt
index ae164991e..0e05776b1 100644
--- a/groups/interlagos/NUMA_4_7.txt
+++ b/groups/interlagos/NUMA_4_7.txt
@@ -15,10 +15,10 @@ DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UPMC3/time
 
 LONG
 Formulas:
-DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+DRAM read/write local to 4 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 5 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 6 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 7 [MegaEvents/s]  = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
 -
 Profiling group to measure the traffic from local CPU to the different
 DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
diff --git a/groups/ivybridge/CLOCK.txt b/groups/ivybridge/CLOCK.txt
index d5e288a0b..fb1910149 100644
--- a/groups/ivybridge/CLOCK.txt
+++ b/groups/ivybridge/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/ivybridge/CYCLE_ACTIVITY.txt b/groups/ivybridge/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/ivybridge/CYCLE_ACTIVITY.txt
+++ b/groups/ivybridge/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/ivybridge/CYCLE_STALLS.txt b/groups/ivybridge/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/ivybridge/CYCLE_STALLS.txt
+++ b/groups/ivybridge/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/ivybridge/DIVIDE.txt b/groups/ivybridge/DIVIDE.txt
index b85de8f23..f8cb0b35b 100644
--- a/groups/ivybridge/DIVIDE.txt
+++ b/groups/ivybridge/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/ivybridge/ENERGY.txt b/groups/ivybridge/ENERGY.txt
index 541c3ad97..92a69157c 100644
--- a/groups/ivybridge/ENERGY.txt
+++ b/groups/ivybridge/ENERGY.txt
@@ -26,7 +26,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/ivybridge/FALSE_SHARE.txt b/groups/ivybridge/FALSE_SHARE.txt
index a87f7d4e6..fbec3f4fb 100644
--- a/groups/ivybridge/FALSE_SHARE.txt
+++ b/groups/ivybridge/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 -
diff --git a/groups/ivybridge/FLOPS_AVX.txt b/groups/ivybridge/FLOPS_AVX.txt
index ea459f4bc..526d5506b 100644
--- a/groups/ivybridge/FLOPS_AVX.txt
+++ b/groups/ivybridge/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on IvyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/ivybridge/FLOPS_DP.txt b/groups/ivybridge/FLOPS_DP.txt
index 73a2f1cf1..e737098c7 100644
--- a/groups/ivybridge/FLOPS_DP.txt
+++ b/groups/ivybridge/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates. Please note that the current
diff --git a/groups/ivybridge/FLOPS_SP.txt b/groups/ivybridge/FLOPS_SP.txt
index c456dfcbe..74837229f 100644
--- a/groups/ivybridge/FLOPS_SP.txt
+++ b/groups/ivybridge/FLOPS_SP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates. Please note that the current
diff --git a/groups/ivybridge/PORT_USAGE.txt b/groups/ivybridge/PORT_USAGE.txt
index 68d66302b..d509607cd 100644
--- a/groups/ivybridge/PORT_USAGE.txt
+++ b/groups/ivybridge/PORT_USAGE.txt
@@ -28,12 +28,12 @@ Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port1 usage ratio UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port2 usage ratio UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port3 usage ratio UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port4 usage ratio UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port5 usage ratio UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/ivybridge/TMA.txt b/groups/ivybridge/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/ivybridge/TMA.txt
+++ b/groups/ivybridge/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/ivybridge/UOPS.txt b/groups/ivybridge/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/ivybridge/UOPS.txt
+++ b/groups/ivybridge/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/ivybridgeEP/CACHES.txt b/groups/ivybridgeEP/CACHES.txt
index dee9a6621..a1e4d13dc 100644
--- a/groups/ivybridgeEP/CACHES.txt
+++ b/groups/ivybridgeEP/CACHES.txt
@@ -109,7 +109,7 @@ L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE
 L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
-Memory write bandwidth [MBytes/s] 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
 Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
 Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
diff --git a/groups/ivybridgeEP/CBOX.txt b/groups/ivybridgeEP/CBOX.txt
index 6450a2eb1..bc30a646c 100644
--- a/groups/ivybridgeEP/CBOX.txt
+++ b/groups/ivybridgeEP/CBOX.txt
@@ -46,8 +46,8 @@ LLC data written to MEM [MBytes] 1E-6*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+C
 
 LONG
 Formulas:
-LLC misses per instruction sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
-LLC data written to MEM [MBytes] sum(LLC_LOOKUP_ANY:STATE=0x1)*64*1E-6
+LLC misses per instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY:STATE=0x1)*64*1E-6
 --
 The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
 CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
diff --git a/groups/ivybridgeEP/CLOCK.txt b/groups/ivybridgeEP/CLOCK.txt
index d5e288a0b..fb1910149 100644
--- a/groups/ivybridgeEP/CLOCK.txt
+++ b/groups/ivybridgeEP/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/ivybridgeEP/CYCLE_ACTIVITY.txt b/groups/ivybridgeEP/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/ivybridgeEP/CYCLE_ACTIVITY.txt
+++ b/groups/ivybridgeEP/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/ivybridgeEP/CYCLE_STALLS.txt b/groups/ivybridgeEP/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/ivybridgeEP/CYCLE_STALLS.txt
+++ b/groups/ivybridgeEP/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/ivybridgeEP/DIVIDE.txt b/groups/ivybridgeEP/DIVIDE.txt
index b85de8f23..f8cb0b35b 100644
--- a/groups/ivybridgeEP/DIVIDE.txt
+++ b/groups/ivybridgeEP/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/ivybridgeEP/ENERGY.txt b/groups/ivybridgeEP/ENERGY.txt
index 07bc59c8f..74c16bbd6 100644
--- a/groups/ivybridgeEP/ENERGY.txt
+++ b/groups/ivybridgeEP/ENERGY.txt
@@ -23,7 +23,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/ivybridgeEP/FALSE_SHARE.txt b/groups/ivybridgeEP/FALSE_SHARE.txt
index 1d0a49e56..5e28a1556 100644
--- a/groups/ivybridgeEP/FALSE_SHARE.txt
+++ b/groups/ivybridgeEP/FALSE_SHARE.txt
@@ -19,7 +19,7 @@ Remote LLC false sharing [MByte] 1.E-06*PMC1*64
 Remote LLC false sharing rate PMC1/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM*64
diff --git a/groups/ivybridgeEP/FLOPS_AVX.txt b/groups/ivybridgeEP/FLOPS_AVX.txt
index 7ca4aca56..0ad669fb0 100644
--- a/groups/ivybridgeEP/FLOPS_AVX.txt
+++ b/groups/ivybridgeEP/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on
 IvyBridge are potentially wrong.
diff --git a/groups/ivybridgeEP/FLOPS_DP.txt b/groups/ivybridgeEP/FLOPS_DP.txt
index 73a2f1cf1..e737098c7 100644
--- a/groups/ivybridgeEP/FLOPS_DP.txt
+++ b/groups/ivybridgeEP/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates. Please note that the current
diff --git a/groups/ivybridgeEP/FLOPS_SP.txt b/groups/ivybridgeEP/FLOPS_SP.txt
index c456dfcbe..74837229f 100644
--- a/groups/ivybridgeEP/FLOPS_SP.txt
+++ b/groups/ivybridgeEP/FLOPS_SP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates. Please note that the current
diff --git a/groups/ivybridgeEP/MEM_DP.txt b/groups/ivybridgeEP/MEM_DP.txt
index b49887c25..eff167771 100644
--- a/groups/ivybridgeEP/MEM_DP.txt
+++ b/groups/ivybridgeEP/MEM_DP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
diff --git a/groups/ivybridgeEP/MEM_SP.txt b/groups/ivybridgeEP/MEM_SP.txt
index 70622dd1d..e541340a5 100644
--- a/groups/ivybridgeEP/MEM_SP.txt
+++ b/groups/ivybridgeEP/MEM_SP.txt
@@ -36,9 +36,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
@@ -48,13 +48,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
diff --git a/groups/ivybridgeEP/NUMA.txt b/groups/ivybridgeEP/NUMA.txt
index 0c1b8fb04..41fbe6288 100644
--- a/groups/ivybridgeEP/NUMA.txt
+++ b/groups/ivybridgeEP/NUMA.txt
@@ -20,7 +20,7 @@ Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
 Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
 Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
diff --git a/groups/ivybridgeEP/PORT_USAGE.txt b/groups/ivybridgeEP/PORT_USAGE.txt
index 68d66302b..d509607cd 100644
--- a/groups/ivybridgeEP/PORT_USAGE.txt
+++ b/groups/ivybridgeEP/PORT_USAGE.txt
@@ -28,12 +28,12 @@ Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port1 usage ratio UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port2 usage ratio UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port3 usage ratio UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port4 usage ratio UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port5 usage ratio UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/ivybridgeEP/QPI.txt b/groups/ivybridgeEP/QPI.txt
index 4dbf8a455..a2f1339a1 100644
--- a/groups/ivybridgeEP/QPI.txt
+++ b/groups/ivybridgeEP/QPI.txt
@@ -38,7 +38,7 @@ QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)
 QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time
 
 LONG
-Formula:
+Formulas:
 QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*clock*(8/1000)
 QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE)
 data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64)
diff --git a/groups/ivybridgeEP/TMA.txt b/groups/ivybridgeEP/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/ivybridgeEP/TMA.txt
+++ b/groups/ivybridgeEP/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/ivybridgeEP/UOPS.txt b/groups/ivybridgeEP/UOPS.txt
index 178aec531..e6cc208dc 100644
--- a/groups/ivybridgeEP/UOPS.txt
+++ b/groups/ivybridgeEP/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FLAGS_MERGE
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/k10/FLOPS_DP.txt b/groups/k10/FLOPS_DP.txt
index aa05d77ae..89f0ac215 100644
--- a/groups/k10/FLOPS_DP.txt
+++ b/groups/k10/FLOPS_DP.txt
@@ -8,15 +8,15 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC2*inverseClock
-DP MFLOP/s    1.0E-06*(PMC0+PMC1)/time
-DP Add MFLOP/s    1.0E-06*PMC0/time
-DP Mult MFLOP/s    1.0E-06*PMC1/time
+DP [MFLOP/s]    1.0E-06*(PMC0+PMC1)/time
+DP Add [MFLOP/s]    1.0E-06*PMC0/time
+DP Mult [MFLOP/s]    1.0E-06*PMC1/time
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
-DP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time
-DP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
+DP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time
+DP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time
 -
 Profiling group to measure double SSE FLOPs.
 Don't forget that your code might also execute X87 FLOPs.
diff --git a/groups/k10/FLOPS_SP.txt b/groups/k10/FLOPS_SP.txt
index 8869557f0..590d39acc 100644
--- a/groups/k10/FLOPS_SP.txt
+++ b/groups/k10/FLOPS_SP.txt
@@ -8,15 +8,15 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC2*inverseClock
-SP MFLOP/s  1.0E-06*(PMC0+PMC1)/time
-SP Add MFLOP/s  1.0E-06*PMC0/time
-SP Mult MFLOP/s   1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0+PMC1)/time
+SP Add [MFLOP/s]  1.0E-06*PMC0/time
+SP Mult [MFLOP/s]   1.0E-06*PMC1/time
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
-SP Add MFLOP/s = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time
-SP Mult MFLOP/s = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time
+SP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time
+SP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time
 -
 Profiling group to measure single precision SSE FLOPs.
 Don't forget that your code might also execute X87 FLOPs.
diff --git a/groups/k10/FLOPS_X87.txt b/groups/k10/FLOPS_X87.txt
index 015ee1927..62fbefcea 100644
--- a/groups/k10/FLOPS_X87.txt
+++ b/groups/k10/FLOPS_X87.txt
@@ -9,17 +9,17 @@ PMC3  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] PMC3*inverseClock
-X87 MFLOP/s       1.0E-06*(PMC0+PMC1+PMC2)/time
-X87 Add MFLOP/s    1.0E-06*PMC0/time
-X87 Mult MFLOP/s   1.0E-06*PMC1/time
-X87 Div MFLOP/s    1.0E-06*PMC2/time
+X87 [MFLOP/s]       1.0E-06*(PMC0+PMC1+PMC2)/time
+X87 Add [MFLOP/s]    1.0E-06*PMC0/time
+X87 Mult [MFLOP/s]   1.0E-06*PMC1/time
+X87 Div [MFLOP/s]    1.0E-06*PMC2/time
 
 LONG
 Formulas:
-X87 MFLOP/s = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time
-X87 Add MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_ADD/time
-X87 Mult MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_MULT/time
-X87 Div MFLOP/s = 1.0E-06*X87_FLOPS_RETIRED_DIV/time
+X87 [MFLOP/s] = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time
+X87 Add [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_ADD/time
+X87 Mult [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_MULT/time
+X87 Div [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_DIV/time
 -
 Profiling group to measure X87 FLOP rates.
 
diff --git a/groups/k10/NUMA_0_3.txt b/groups/k10/NUMA_0_3.txt
index bdda6e0b6..66e56d9fe 100644
--- a/groups/k10/NUMA_0_3.txt
+++ b/groups/k10/NUMA_0_3.txt
@@ -15,10 +15,10 @@ Hyper Transport link3 bandwidth [MBytes/s]  1.0E-06*PMC3*4.0/time
 
 LONG
 Formulas:
-Hyper Transport link0 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
-Hyper Transport link1 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
-Hyper Transport link2 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
-Hyper Transport link3 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
+Hyper Transport link0 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
 -
 Profiling group to measure the bandwidth over the Hypertransport links. Can be used
 to detect NUMA problems. Usually there should be only limited traffic over the QPI
diff --git a/groups/k10/NUMA_4_7.txt b/groups/k10/NUMA_4_7.txt
index aa10be05f..e13f2b95e 100644
--- a/groups/k10/NUMA_4_7.txt
+++ b/groups/k10/NUMA_4_7.txt
@@ -15,10 +15,10 @@ Hyper Transport link7 bandwidth [MBytes/s]  1.0E-06*PMC3*4.0/time
 
 LONG
 Formulas:
-Hyper Transport link4 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
-Hyper Transport link5 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
-Hyper Transport link6 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
-Hyper Transport link7 bandwidth [MBytes/s]  1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
+Hyper Transport link4 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_4*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_5*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_6*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_7*4.0/time
 -
 Profiling group to measure the bandwidth over the Hypertransport links. Can be used
 to detect NUMA problems. Usually there should be only limited traffic over the QPI
diff --git a/groups/k10/TLB.txt b/groups/k10/TLB.txt
index 2491c8de3..25cab335c 100644
--- a/groups/k10/TLB.txt
+++ b/groups/k10/TLB.txt
@@ -18,12 +18,12 @@ L2 DTLB miss ratio    PMC3/(PMC2+PMC3)
 
 LONG
 Formulas:
-L1 DTLB request rate  DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
-L1 DTLB miss rate  (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
-L1 DTLB miss ratio  (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/DATA_CACHE_ACCESSES
-L2 DTLB request rate  (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
-L2 DTLB miss rate  DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED
-L2 DTLB miss ratio DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)
+L1 DTLB request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+L1 DTLB miss rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
+L1 DTLB miss ratio = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
+L2 DTLB miss rate = DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED
+L2 DTLB miss ratio = DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)
 -
 L1 DTLB request  rate tells you how data intensive your code is
 or how many data accesses you have on average per instruction.
diff --git a/groups/k8/L2.txt b/groups/k8/L2.txt
index c3ad5175e..63b9b7fe4 100644
--- a/groups/k8/L2.txt
+++ b/groups/k8/L2.txt
@@ -15,10 +15,10 @@ L2 evict  [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
-L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
+L2 refill bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
+L2 evict [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cache line loaded from L2 to L1 and the
diff --git a/groups/kabini/FLOPS_DP.txt b/groups/kabini/FLOPS_DP.txt
index d6af2e201..1a4e54c8d 100644
--- a/groups/kabini/FLOPS_DP.txt
+++ b/groups/kabini/FLOPS_DP.txt
@@ -9,14 +9,14 @@ PMC3  RETIRED_FLOPS_DOUBLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-DP MFLOP/s    1.0E-06*(PMC3)/time
+DP [MFLOP/s]    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
 CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
 CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
 IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
diff --git a/groups/kabini/FLOPS_SP.txt b/groups/kabini/FLOPS_SP.txt
index 0fe4e5420..f6c08c14b 100644
--- a/groups/kabini/FLOPS_SP.txt
+++ b/groups/kabini/FLOPS_SP.txt
@@ -9,14 +9,14 @@ PMC3  RETIRED_FLOPS_SINGLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-SP MFLOP/s    1.0E-06*(PMC3)/time
+SP [MFLOP/s]    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
 CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
 CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
 IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
diff --git a/groups/kabini/NUMA_0_3.txt b/groups/kabini/NUMA_0_3.txt
index ed13dbe30..79f3618c5 100644
--- a/groups/kabini/NUMA_0_3.txt
+++ b/groups/kabini/NUMA_0_3.txt
@@ -15,10 +15,10 @@ DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UPMC3/time
 
 LONG
 Formulas:
-DRAM read/write local to 0 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
-DRAM read/write local to 1 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
-DRAM read/write local to 2 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
-DRAM read/write local to 3 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
 -
 Profiling group to measure the traffic from local CPU to the different
 DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
diff --git a/groups/kabini/NUMA_4_7.txt b/groups/kabini/NUMA_4_7.txt
index b74488134..7b518db27 100644
--- a/groups/kabini/NUMA_4_7.txt
+++ b/groups/kabini/NUMA_4_7.txt
@@ -15,10 +15,10 @@ DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UPMC3/time
 
 LONG
 Formulas:
-DRAM read/write local to 4 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
-DRAM read/write local to 5 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
-DRAM read/write local to 6 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
-DRAM read/write local to 7 [MegaEvents/s]  1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+DRAM read/write local to 4 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
 -
 Profiling group to measure the traffic from local CPU to the different
 DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded
diff --git a/groups/kabini/TLB.txt b/groups/kabini/TLB.txt
index 707f888a7..f66b3cb90 100644
--- a/groups/kabini/TLB.txt
+++ b/groups/kabini/TLB.txt
@@ -18,12 +18,12 @@ L2 DTLB miss ratio    PMC3/(PMC2+PMC3)
 
 LONG
 Formulas:
-L1 DTLB request rate  DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-L1 DTLB miss rate  (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
-L1 DTLB miss ratio  (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/DATA_CACHE_ACCESSES
-L2 DTLB request rate  (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
-L2 DTLB miss rate  DTLB_MISS_ALL / RETIRED_INSTRUCTIONS
-L2 DTLB miss ratio DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)
+L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+L1 DTLB miss rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
+L1 DTLB miss ratio = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
+L2 DTLB miss rate = DTLB_MISS_ALL / RETIRED_INSTRUCTIONS
+L2 DTLB miss ratio = DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)
 -
 L1 DTLB request  rate tells you how data intensive your code is
 or how many data accesses you have on average per instruction.
diff --git a/groups/knl/CLOCK.txt b/groups/knl/CLOCK.txt
index 2ddd921ac..8756ef20b 100644
--- a/groups/knl/CLOCK.txt
+++ b/groups/knl/CLOCK.txt
@@ -15,7 +15,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 -
 The Xeon Phi (Knights Landing) implements the new RAPL interface. This interface enables to
diff --git a/groups/knl/DIVIDE.txt b/groups/knl/DIVIDE.txt
index 3c5c395eb..d9b09180f 100644
--- a/groups/knl/DIVIDE.txt
+++ b/groups/knl/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = CYCLES_DIV_BUSY_COUNT
+Avg. divide unit usage duration = CYCLES_DIV_BUSY/CYCLES_DIV_BUSY_COUNT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/knl/ENERGY.txt b/groups/knl/ENERGY.txt
index df8092de4..19ede75f3 100644
--- a/groups/knl/ENERGY.txt
+++ b/groups/knl/ENERGY.txt
@@ -23,7 +23,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/knl/FLOPS_DP.txt b/groups/knl/FLOPS_DP.txt
index af2e24835..88bffe2af 100644
--- a/groups/knl/FLOPS_DP.txt
+++ b/groups/knl/FLOPS_DP.txt
@@ -12,23 +12,23 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s (SSE assumed) 1.0E-06*((PMC1*2.0)+PMC0)/time
-DP MFLOP/s (AVX assumed) 1.0E-06*((PMC1*4.0)+PMC0)/time
-DP MFLOP/s (AVX512 assumed) 1.0E-06*((PMC1*8.0)+PMC0)/time
-Packed MUOPS/s   1.0E-06*(PMC1)/time
-Scalar MUOPS/s 1.0E-06*PMC0/time
+DP [MFLOP/s] (SSE assumed) 1.0E-06*((PMC1*2.0)+PMC0)/time
+DP [MFLOP/s] (AVX assumed) 1.0E-06*((PMC1*4.0)+PMC0)/time
+DP [MFLOP/s] (AVX512 assumed) 1.0E-06*((PMC1*8.0)+PMC0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*PMC0/time
 
 LONG
-Formula:
-DP MFLOP/s (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*2+UOPS_RETIRED_SCALAR_SIMD)/runtime
-DP MFLOP/s (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime
-DP MFLOP/s (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime
-Packed MUOPS/s = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime
-Scalar MUOPS/s = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime
+Formulas:
+DP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*2+UOPS_RETIRED_SCALAR_SIMD)/runtime
+DP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime
+DP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime
+Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime
+Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime
 -
 AVX/SSE scalar and packed double precision FLOP rates. The Xeon Phi (Knights Landing) provides
 no possibility to differentiate between double and single precision FLOP/s. Therefore, we only
-assume that the printed MFLOP/s value is for double-precision code. Moreover, there is no way
+assume that the printed MFLOP/s value is for double-precision code. Moreover, there is no way
 to distinguish between SSE, AVX or AVX512 packed SIMD operations. Therefore, this group prints
-out the MFLOP/s for different SIMD techniques.
+out the MFLOP/s for different SIMD techniques.
 WARNING: The events also count for integer arithmetics
diff --git a/groups/knl/FLOPS_SP.txt b/groups/knl/FLOPS_SP.txt
index 750c80868..4a28116a2 100644
--- a/groups/knl/FLOPS_SP.txt
+++ b/groups/knl/FLOPS_SP.txt
@@ -12,19 +12,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s (SSE assumed) 1.0E-06*(PMC1*4.0+PMC0)/time
-SP MFLOP/s (AVX assumed) 1.0E-06*(PMC1*8.0+PMC0)/time
-SP MFLOP/s (AVX512 assumed) 1.0E-06*(PMC1*16.0+PMC0)/time
-Packed MUOPS/s   1.0E-06*(PMC1)/time
-Scalar MUOPS/s 1.0E-06*PMC0/time
+SP [MFLOP/s] (SSE assumed) 1.0E-06*(PMC1*4.0+PMC0)/time
+SP [MFLOP/s] (AVX assumed) 1.0E-06*(PMC1*8.0+PMC0)/time
+SP [MFLOP/s] (AVX512 assumed) 1.0E-06*(PMC1*16.0+PMC0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*PMC0/time
 
 LONG
-Formula:
-SP MFLOP/s (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime
-SP MFLOP/s (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime
-SP MFLOP/s (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*16+UOPS_RETIRED_SCALAR_SIMD)/runtime
-Packed MUOPS/s = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime
-Scalar MUOPS/s = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime
+Formulas:
+SP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime
+SP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime
+SP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*16+UOPS_RETIRED_SCALAR_SIMD)/runtime
+Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime
+Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime
 -
 AVX/SSE scalar and packed single precision FLOP rates. The Xeon Phi (Knights Landing) provides
 no possibility to differentiate between double and single precision FLOP/s. Therefore, we only
diff --git a/groups/knl/FRONTEND_STALLS.txt b/groups/knl/FRONTEND_STALLS.txt
index 9c725dbb5..1b9f98e7a 100644
--- a/groups/knl/FRONTEND_STALLS.txt
+++ b/groups/knl/FRONTEND_STALLS.txt
@@ -18,7 +18,8 @@ Frontend stall ratio PMC0/FIXC1
 
 LONG
 Formulas:
--
 Frontend stalls = NO_ALLOC_CYCLES_ALL
 Avg. frontend stall duration [cyc] = NO_ALLOC_CYCLES_ALL/NO_ALLOC_CYCLES_ALL_COUNT
 Frontend stall ratio = NO_ALLOC_CYCLES_ALL/CPU_CLK_UNHALTED_CORE
+-
+This performance group measures the number, average duration and ratio of frontend stalls.
diff --git a/groups/knl/L2.txt b/groups/knl/L2.txt
index bde3c4e0e..4a9370c11 100644
--- a/groups/knl/L2.txt
+++ b/groups/knl/L2.txt
@@ -20,7 +20,7 @@ L2 bandwidth [MBytes/s] 1.E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GByte] 1.E-09*(PMC0+PMC1)*64.0
 
 LONG
-Formula:
+Formulas:
 L2 non-RFO bandwidth [MBytes/s] = 1.E-06*L2_REQUESTS_REFERENCE*64.0/time
 L2 non-RFO data volume [GByte] = 1.E-09*L2_REQUESTS_REFERENCE*64.0
 L2 RFO bandwidth [MBytes/s] = 1.E-06*(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0/time
diff --git a/groups/knl/UOPS_STALLS.txt b/groups/knl/UOPS_STALLS.txt
index 0252857be..97cfa999f 100644
--- a/groups/knl/UOPS_STALLS.txt
+++ b/groups/knl/UOPS_STALLS.txt
@@ -17,9 +17,9 @@ Avg. stall duration [cyc] PMC0/PMC1
 Stall ratio PMC0/FIXC1
 
 LONG
+Formulas:
 Number of stalls = UOPS_RETIRED_STALLS
 Avg. stall duration [cyc] = UOPS_RETIRED_STALLED_CYCLES/UOPS_RETIRED_STALLS
 Stall ratio = UOPS_RETIRED_STALLED_CYCLES/CPU_CLK_UNHALTED_CORE
-Formulas:
 -
 This group measures stalls in the UOP retirement. 
diff --git a/groups/nehalem/DIVIDE.txt b/groups/nehalem/DIVIDE.txt
index e80d3e432..6c172950e 100644
--- a/groups/nehalem/DIVIDE.txt
+++ b/groups/nehalem/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/nehalem/FLOPS_DP.txt b/groups/nehalem/FLOPS_DP.txt
index 01160e6ca..0c2e56c11 100644
--- a/groups/nehalem/FLOPS_DP.txt
+++ b/groups/nehalem/FLOPS_DP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/nehalem/FLOPS_SP.txt b/groups/nehalem/FLOPS_SP.txt
index cae470572..8046cbdb4 100644
--- a/groups/nehalem/FLOPS_SP.txt
+++ b/groups/nehalem/FLOPS_SP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/nehalem/FLOPS_X87.txt b/groups/nehalem/FLOPS_X87.txt
index a4176f046..39cd8b49d 100644
--- a/groups/nehalem/FLOPS_X87.txt
+++ b/groups/nehalem/FLOPS_X87.txt
@@ -11,8 +11,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
+Formulas:
+X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime
+-
 Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/nehalemEX/DIVIDE.txt b/groups/nehalemEX/DIVIDE.txt
index 29cee22fa..725ca841e 100644
--- a/groups/nehalemEX/DIVIDE.txt
+++ b/groups/nehalemEX/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_CYCLES_DIV_BUSY:EDGEDETECT
+Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_CYCLES_DIV_BUSY:EDGEDETECT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/nehalemEX/FLOPS_DP.txt b/groups/nehalemEX/FLOPS_DP.txt
index 01160e6ca..0c2e56c11 100644
--- a/groups/nehalemEX/FLOPS_DP.txt
+++ b/groups/nehalemEX/FLOPS_DP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/nehalemEX/FLOPS_SP.txt b/groups/nehalemEX/FLOPS_SP.txt
index cae470572..8046cbdb4 100644
--- a/groups/nehalemEX/FLOPS_SP.txt
+++ b/groups/nehalemEX/FLOPS_SP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/nehalemEX/FLOPS_X87.txt b/groups/nehalemEX/FLOPS_X87.txt
index a4176f046..39cd8b49d 100644
--- a/groups/nehalemEX/FLOPS_X87.txt
+++ b/groups/nehalemEX/FLOPS_X87.txt
@@ -11,8 +11,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
+Formulas:
+X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime
+-
 Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/nehalemEX/SCHEDULER.txt b/groups/nehalemEX/SCHEDULER.txt
index 237fcb8af..0e43cce9a 100644
--- a/groups/nehalemEX/SCHEDULER.txt
+++ b/groups/nehalemEX/SCHEDULER.txt
@@ -17,7 +17,7 @@ Ratio Port 1  PMC1/PMC0
 Ratio Port 5  PMC2/PMC0
 
 LONG
-Forumlas:
+Formulas:
 Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0
 Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0
 -
diff --git a/groups/pentiumm/CPI.txt b/groups/pentiumm/CPI.txt
index 1df7ff8d1..ae4aa26cf 100644
--- a/groups/pentiumm/CPI.txt
+++ b/groups/pentiumm/CPI.txt
@@ -11,8 +11,8 @@ IPC   PMC0/PMC1
 
 LONG
 Formulas:
-CPI   CPU_CLK_UNHALTED/UOPS_RETIRED
-IPC   UOPS_RETIRED/CPU_CLK_UNHALTED
+CPI  = CPU_CLK_UNHALTED/UOPS_RETIRED
+IPC  = UOPS_RETIRED/CPU_CLK_UNHALTED
 -
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
diff --git a/groups/pentiumm/FLOPS_DP.txt b/groups/pentiumm/FLOPS_DP.txt
index 6e8568aab..058a64e14 100644
--- a/groups/pentiumm/FLOPS_DP.txt
+++ b/groups/pentiumm/FLOPS_DP.txt
@@ -6,15 +6,15 @@ PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP
 
 METRICS
 Runtime (RDTSC) [s] time
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*(PMC0)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 
 LONG
-Formula:
-DP MFLOP/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2 + EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP )/ runtime
-Packed MUOPS/s = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time
-Scalar MUOPS/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2+EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP)/runtime
+Packed [MUOPS/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time
+Scalar [MUOPS/s] = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time
 -
 SSE scalar and packed double precision FLOP rates.
 
diff --git a/groups/pentiumm/FLOPS_SP.txt b/groups/pentiumm/FLOPS_SP.txt
index f3ae15ace..d70b835d9 100644
--- a/groups/pentiumm/FLOPS_SP.txt
+++ b/groups/pentiumm/FLOPS_SP.txt
@@ -6,13 +6,13 @@ PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP
 
 METRICS
 Runtime (RDTSC) [s] time
-SP MFLOP/s  1.0E-06*(PMC0)/time
-Scalar MUOPS/s 1.0E-06*(PMC1)/time
+SP [MFLOP/s]  1.0E-06*(PMC0)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC1)/time
 
 LONG
-Formula:
-SP MFLOP/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP)/ runtime
-Scalar MUOPS/s =  (EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP)/ runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP)/runtime
 -
 SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/phi/VECTOR.txt b/groups/phi/VECTOR.txt
index fd2e27fd9..b6ec6a66f 100644
--- a/groups/phi/VECTOR.txt
+++ b/groups/phi/VECTOR.txt
@@ -9,7 +9,7 @@ Runtime (RDTSC) [s] time
 Vectorization intensity PMC1/PMC0
 
 LONG
-Formula:
+Formulas:
 Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED
 -
 Vector instructions include instructions that perform floating-point
diff --git a/groups/phi/VECTOR2.txt b/groups/phi/VECTOR2.txt
index 78e6b82a9..52b3c5915 100644
--- a/groups/phi/VECTOR2.txt
+++ b/groups/phi/VECTOR2.txt
@@ -10,6 +10,7 @@ Runtime unhalted [s]  PMC1*inverseClock
 VPU stall ratio [%] 100*(VPU_STALL_REG/PMC0)
 
 LONG
+Formulas:
 VPU stall ratio [%] = 100*(VPU_STALL_REG/VPU_INSTRUCTIONS_EXECUTED)
 --
 This group measures how efficient the processor works with
diff --git a/groups/phi/VPU_READ_MISS_RATIO.txt b/groups/phi/VPU_READ_MISS_RATIO.txt
index 502644a25..cf04c5f8d 100644
--- a/groups/phi/VPU_READ_MISS_RATIO.txt
+++ b/groups/phi/VPU_READ_MISS_RATIO.txt
@@ -9,7 +9,7 @@ Runtime (RDTSC) [s] time
 VPU read miss ratio PMC1/PMC0
 
 LONG
-Formula:
+Formulas:
 VPU read miss ratio = PMC1/PMC0
 --
 This performance group determines the ratio between reads and reads that miss
diff --git a/groups/phi/VPU_WRITE_MISS_RATIO.txt b/groups/phi/VPU_WRITE_MISS_RATIO.txt
index b098b6f2f..cebf3c767 100644
--- a/groups/phi/VPU_WRITE_MISS_RATIO.txt
+++ b/groups/phi/VPU_WRITE_MISS_RATIO.txt
@@ -9,7 +9,7 @@ Runtime (RDTSC) [s] time
 VPU write miss ratio PMC1/PMC0
 
 LONG
-Formula:
+Formulas:
 VPU write miss ratio = PMC1/PMC0
 --
 This performance group determines the ratio between writes and writes that miss
diff --git a/groups/power8/BRANCH.txt b/groups/power8/BRANCH.txt
new file mode 100644
index 000000000..870bb9d36
--- /dev/null
+++ b/groups/power8/BRANCH.txt
@@ -0,0 +1,30 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0  PM_BR_PRED_BR_CMPL
+PMC1  PM_BR_PRED_CCACHE_CMPL
+PMC2  PM_BR_PRED_CR_CMPL
+PMC3  PM_BR_MPRED_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Branch rate   (PMC0+PMC1+PMC2)/PMC4
+Branch misprediction rate  PMC3/PMC4
+Branch misprediction ratio  PMC3/(PMC0+PMC1+PMC2)
+Instructions per branch  PMC4/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+Branch rate = (PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)/PM_RUN_INST_CMPL
+Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL
+Branch misprediction ratio = PM_BR_MPRED_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+Instructions per branch = PM_RUN_INST_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/power8/CPISTACK1.txt b/groups/power8/CPISTACK1.txt
new file mode 100644
index 000000000..50567234d
--- /dev/null
+++ b/groups/power8/CPISTACK1.txt
@@ -0,0 +1,28 @@
+SHORT First level of IBM CPI stack 
+
+EVENTSET
+PMC0  PM_CMPLU_STALL_THRD 
+PMC1  PM_GCT_EMPTY_CYC
+PMC3  PM_CMPLU_STALL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Stall cycles PMC3
+Stall cycle ratio PMC3/PMC5
+Thread blocked cycles PMC0
+Thread blocked cycle ratio PMC0/PMC5
+GCT empty cycles PMC1
+GCT empty cycle ratio PMC1/PMC5
+
+
+
+
+LONG
+--
+First level of IBM CPI stack. IBM names Stalled Cycles, Waiting to Complete,
+Thread Blocked, Completion Table Empty, Other and Completion Cycles. For some
+there are no clearly identifiable events, so this group concentrates on
+Stalled Cycles (PM_CMPLU_STALL), Thread Blocked (PM_CMPLU_STALL_THRD),
+Completion Table Empty (PM_GCT_EMPTY_CYC) and Other (PM_CMPLU_STALL_OTHER_CMPL).
diff --git a/groups/power8/DATA.txt b/groups/power8/DATA.txt
new file mode 100644
index 000000000..bc3b893fa
--- /dev/null
+++ b/groups/power8/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC0  PM_LD_CMPL
+PMC1  PM_ST_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Load to store ratio PMC0/PMC1
+Load ratio PMC0/PMC4
+Store ratio PMC1/PMC4
+
+LONG
+Formulas:
+Load to store ratio = PM_LD_CMPL/PM_ST_CMPL
+Load ratio = PM_LD_CMPL/PM_RUN_INST_CMPL
+Store ratio = PM_ST_CMPL/PM_RUN_INST_CMPL
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/power8/FLOPS_1_2.txt b/groups/power8/FLOPS_1_2.txt
new file mode 100644
index 000000000..93154b068
--- /dev/null
+++ b/groups/power8/FLOPS_1_2.txt
@@ -0,0 +1,18 @@
+SHORT Group 121 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_1FLOP
+PMC1  PM_VSU1_1FLOP
+PMC2  PM_VSU0_2FLOP
+PMC3  PM_VSU1_2FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+One FLOP ops PMC0+PMC1
+Two FLOPs ops PMC2+PMC3
+MFLOP/s  1E-6*(PMC0+PMC1+((PMC2+PMC3)*2))/time
+
+LONG
+Group 121 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/groups/power8/FLOPS_4_8.txt b/groups/power8/FLOPS_4_8.txt
new file mode 100644
index 000000000..425a22abd
--- /dev/null
+++ b/groups/power8/FLOPS_4_8.txt
@@ -0,0 +1,18 @@
+SHORT Group 122 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_4FLOP
+PMC1  PM_VSU1_4FLOP
+PMC2  PM_VSU0_8FLOP
+PMC3  PM_VSU1_8FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Four FLOPs ops PMC0+PMC1
+Eight FLOPs ops PMC2+PMC3
+MFLOP/s 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*8.0))/time
+
+LONG
+Group 122 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/groups/power8/FLOPS_DP.txt b/groups/power8/FLOPS_DP.txt
new file mode 100644
index 000000000..dcbbc83ed
--- /dev/null
+++ b/groups/power8/FLOPS_DP.txt
@@ -0,0 +1,27 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_DP_2FLOP
+PMC1  PM_VSU0_DP_FMA
+PMC2  PM_VSU0_DP_FSQRT_FDIV
+PMC3  PM_VSU0_SCALAR_DP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time
+VSX MFlops/s  1.0E-06*((PMC1*4.0)+(PMC0*2.0))/time
+Packed MUOPS/s   1.0E-06*(PMC1)/time
+Scalar MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*((PM_VSU0_DP_2FLOP*2.0)+PM_VSU0_DP_FSQRT_FDIV+(PM_VSU0_DP_FMA*4.0))/runtime
+VSX MFlops/s = 1.0E-06*((PM_VSU0_DP_FMA*4.0)+(PM_VSU0_DP_2FLOP*2.0))/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU0_DP_FMA)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU0_DP_2FLOP+PM_VSU0_DP_FSQRT_FDIV)/runtime
+-
+
diff --git a/groups/power8/FLOPS_DP2.txt b/groups/power8/FLOPS_DP2.txt
new file mode 100644
index 000000000..a3007526c
--- /dev/null
+++ b/groups/power8/FLOPS_DP2.txt
@@ -0,0 +1,26 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU1_DP_2FLOP
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU1_DP_FSQRT_FDIV
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*(PMC0+PMC2+(PMC1)*4.0)/time
+AVX MFlops/s  1.0E-06*((PMC1)*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC1)/time
+Scalar MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*(PM_VSU1_DP_2FLOP+PM_VSU1_DP_FSQRT_FDIV+(PM_VSU1_DP_FMA)*4.0)/runtime
+AVX MFlops/s = 1.0E-06*((PM_VSU1_DP_FMA)*4.0)/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU1_DP_FMA)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU1_DP_2FLOP+PM_VSU1_DP_FSQRT_FDIV)/runtime
+-
+
diff --git a/groups/power8/FLOPS_FMA.txt b/groups/power8/FLOPS_FMA.txt
new file mode 100644
index 000000000..8ab2f5b59
--- /dev/null
+++ b/groups/power8/FLOPS_FMA.txt
@@ -0,0 +1,20 @@
+SHORT Group 124 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_DP_FMA
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU0_FMA
+PMC3  PM_VSU1_FMA
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+DP FMAs PMC0+PMC1
+Scalar FMAs PMC2+PMC3
+DP FMA MFLOP/s 1E-6*(PMC0+PMC1)*4.0/time
+Scalar FMA MFLOP/s 1E-6*(PMC2+PMC3)*2.0/time
+MFLOP/s 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*2.0))/time
+
+LONG
+Group 124 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/groups/power8/FLOPS_SP.txt b/groups/power8/FLOPS_SP.txt
new file mode 100644
index 000000000..6d11df06e
--- /dev/null
+++ b/groups/power8/FLOPS_SP.txt
@@ -0,0 +1,27 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_SINGLE
+PMC1  PM_VSU0_VECTOR_SP_ISSUED
+PMC2  PM_VSU1_SINGLE
+PMC3  PM_VSU1_VECTOR_SP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*(((PMC0-PMC1)+(PMC2-PMC3))*4.0+(PMC1+PMC3)*8.0)/time
+AVX MFlops/s  1.0E-06*((PMC1+PMC3)*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC1+PMC3)/time
+Scalar MUOPS/s 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*(((PM_VSU0_SINGLE-PM_VSU0_VECTOR_SP_ISSUED)+(PM_VSU1_SINGLE-PM_VSU1_VECTOR_SP_ISSUED))*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+AVX MFlops/s = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8)/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE)/runtime
+-
+
diff --git a/groups/power8/FLOPS_VSU0.txt b/groups/power8/FLOPS_VSU0.txt
new file mode 100644
index 000000000..1c998a788
--- /dev/null
+++ b/groups/power8/FLOPS_VSU0.txt
@@ -0,0 +1,25 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 0
+
+EVENTSET
+PMC0  PM_VSU0_DP_2FLOP
+PMC1  PM_VSU0_DP_FMA
+PMC2  PM_VSU0_DP_FSQRT_FDIV
+PMC3  PM_VSU0_1FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+AVX MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*(PM_VSU0_SCALAR_DP_ISSUED+PM_VSU1_SCALAR_DP_ISSUED+(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4)/runtime
+AVX MFlops/s = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4)/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU0_SCALAR_DP_ISSUED+PM_VSU1_SCALAR_DP_ISSUED)/runtime
+-
+
diff --git a/groups/power8/FLOPS_VSU1.txt b/groups/power8/FLOPS_VSU1.txt
new file mode 100644
index 000000000..f693c6571
--- /dev/null
+++ b/groups/power8/FLOPS_VSU1.txt
@@ -0,0 +1,25 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 1
+
+EVENTSET
+PMC0  PM_VSU1_DP_2FLOP
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU1_DP_FSQRT_FDIV
+PMC3  PM_VSU1_1FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+AVX MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*(PM_VSU0_SCALAR_DP_ISSUED+PM_VSU1_SCALAR_DP_ISSUED+(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4)/runtime
+AVX MFlops/s = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4)/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU0_SCALAR_DP_ISSUED+PM_VSU1_SCALAR_DP_ISSUED)/runtime
+-
+
diff --git a/groups/power8/FLOPS_VSX.txt b/groups/power8/FLOPS_VSX.txt
new file mode 100644
index 000000000..ba63c243b
--- /dev/null
+++ b/groups/power8/FLOPS_VSX.txt
@@ -0,0 +1,28 @@
+SHORT Vectorized MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_VECTOR_DP_ISSUED
+PMC1  PM_VSU1_VECTOR_DP_ISSUED
+PMC2  PM_VSU0_VECTOR_SP_ISSUED
+PMC3  PM_VSU1_VECTOR_SP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s  1.0E-06*((PMC0+PMC1)*4.0+(PMC2+PMC3)*8.0)/time
+DP MFlops/s  1.0E-06*((PMC0+PMC1)*4.0)/time
+SP MFlops/s  1.0E-06*((PMC2+PMC3)*8.0)/time
+DP MIOPS/s   1.0E-06*(PMC0+PMC1)/time
+SP MIOPS/s   1.0E-06*(PMC2+PMC3)/time
+
+LONG
+Formula:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8)/runtime
+AVX MFlops/s = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8)/runtime
+Packed MUOPS/s = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+Scalar MUOPS/s = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE)/runtime
+-
+
diff --git a/groups/power8/ICACHE.txt b/groups/power8/ICACHE.txt
new file mode 100644
index 000000000..7a07fd4f7
--- /dev/null
+++ b/groups/power8/ICACHE.txt
@@ -0,0 +1,22 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_INST_FROM_L1
+PMC1  PM_L1_ICACHE_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1I request rate PMC0/PMC4
+L1I miss rate PMC1/PMC4
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = PM_INST_FROM_L1/PM_RUN_INST_CMPL
+L1I miss rate = PM_L1_ICACHE_MISS/PM_RUN_INST_CMPL
+L1I miss ratio = PM_L1_ICACHE_MISS/PM_INST_FROM_L1
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/power8/L1.txt b/groups/power8/L1.txt
new file mode 100644
index 000000000..19dc36e45
--- /dev/null
+++ b/groups/power8/L1.txt
@@ -0,0 +1,33 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_LD_REF_L1 
+PMC1  PM_ST_CMPL
+PMC2  PM_LSU_L1_PREF
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2D load bandwidth [MBytes/s]  1.0E-06*((PMC0+PMC2)/2)*64.0/time
+L2D load data volume [GBytes]  1.0E-09*((PMC0+PMC2)/2)*64.0
+L2D store bandwidth [MBytes/s]  1.0E-06*((PMC1/2))*64.0/time
+L2D store data volume [GBytes]  1.0E-09*((PMC1/2))*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*((PMC1+PMC0+PMC2)/2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*((PMC1+PMC0+PMC2)/2)*64.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L2/2)*128.0/time
+L2D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L2/2)*128.0
+L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_ST_MISS_L1)*128.0/time
+L2D store data volume [GBytes] = 1.0E-09*(PM_ST_MISS_L1)*128.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L2/2 + PM_ST_MISS_L1)*128.0/time
+L2 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L2/2 + PM_ST_MISS_L1)*128.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L2 to the L1 data cache. There is currently no
+event to get the evicted data volume.
diff --git a/groups/power8/L2.txt b/groups/power8/L2.txt
new file mode 100644
index 000000000..d5af584ac
--- /dev/null
+++ b/groups/power8/L2.txt
@@ -0,0 +1,32 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L2_ST 
+PMC2  PM_LD_MISS_L1
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2D load bandwidth [MBytes/s]  1.0E-06*(PMC2/2)*128.0/time
+L2D load data volume [GBytes]  1.0E-09*(PMC2/2)*128.0
+L2D store bandwidth [MBytes/s]  1.0E-06*(PMC0/2)*128.0/time
+L2D store data volume [GBytes]  1.0E-09*(PMC0/2)*128.0
+L2 bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*128.0/time
+L2 data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L2/2)*128.0/time
+L2D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L2/2)*128.0
+L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_ST_CMPL/2)*128.0/time
+L2D store data volume [GBytes] = 1.0E-09*(PM_ST_CMPL/2)*128.0
+L2 bandwidth [MBytes/s] = 1.0E-06*((PM_DATA_FROM_L2 + PM_ST_CMPL))*128.0/time
+L2 data volume [GBytes] = 1.0E-09*((PM_DATA_FROM_L2 + PM_ST_CMPL))*128.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L2 to the L1 data cache. There is currently no
+event to get the evicted data volume.
diff --git a/groups/power8/L2CACHE.txt b/groups/power8/L2CACHE.txt
new file mode 100644
index 000000000..47bcedd5f
--- /dev/null
+++ b/groups/power8/L2CACHE.txt
@@ -0,0 +1,40 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_L2_ST_MISS
+PMC1  PM_L2_LD_MISS
+PMC2  PM_L2_LD_DISP
+PMC3  PM_L2_ST_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 request rate (PMC2+PMC3)/PMC4
+L2 miss rate (PMC0+PMC1)/PMC4
+L2 miss ratio (PMC0+PMC1)/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 miss rate = (PM_L2_LD_MISS+PM_L2_ST_MISS)/PM_RUN_INST_CMPL
+L2 miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 load request rate = PM_L2_LD_DISP/PM_RUN_INST_CMPL
+L2 store request rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL
+L2 load miss rate = PM_L2_LD_MISS/PM_RUN_INST_CMPL
+L2 store miss rate = PM_L2_ST_MISS/PM_RUN_INST_CMPL
+L2 load miss ratio = PM_L2_LD_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 store miss ratio = PM_L2_ST_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/power8/L2CACHE.txt~ b/groups/power8/L2CACHE.txt~
new file mode 100644
index 000000000..920f32a1f
--- /dev/null
+++ b/groups/power8/L2CACHE.txt~
@@ -0,0 +1,39 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_L2_ST_MISS
+PMC1  PM_L2_LD_MISS
+PMC2  PM_L2_LD_DISP
+PMC3  PM_L2_ST_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 request rate = (PMC2+PMC3)/PMC4
+L2 miss rate = (PMC0+PMC1)/PMC4
+L2 miss ratio = (PMC0+PMC1)/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 miss rate = (PM_L2_LD_MISS+PM_L2_ST_MISS)/PM_RUN_INST_CMPL
+L2 miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 load request rate = PM_L2_LD_DISP/PM_RUN_INST_CMPL
+L2 store request rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL
+L2 load miss rate = PM_L2_LD_MISS/PM_RUN_INST_CMPL
+L2 store miss rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL
+L2 load miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/power8/L3.txt b/groups/power8/L3.txt
new file mode 100644
index 000000000..0737c444e
--- /dev/null
+++ b/groups/power8/L3.txt
@@ -0,0 +1,31 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L3_LD_PREF
+PMC3  PM_DATA_FROM_L3
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L3D load bandwidth [MBytes/s]  1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3D load data volume [GBytes]  1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+Loads from local L3 per cycle 100.0*(PMC3+(PMC0-PMC3))/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3)*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3)*128.0
+L3D evict bandwidth [MBytes/s] = 1.0E-06*(PM_L2_CASTOUT_MOD)*128.0/time
+L3D evict data volume [GBytes] = 1.0E-09*(PM_L2_CASTOUT_MOD)*128.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0/time
+L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L3 to the L2 data cache. There is currently no
+event to get the evicted data volume.
diff --git a/groups/power8/MEM.txt b/groups/power8/MEM.txt
new file mode 100644
index 000000000..cb70a9f72
--- /dev/null
+++ b/groups/power8/MEM.txt
@@ -0,0 +1,28 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L3_CO_MEPF
+PMC1  PM_DATA_ALL_FROM_MEMORY 
+PMC3  PM_L3_PF_ON_CHIP_MEM
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time
+Memory evict data volume [GBytes] 1.0E-09*(PMC0)*128.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3+PMC0)*128.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC1+PMC3+PMC0)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Memory load bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128/time
+Memory load data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128
+Memory evict bandwidth [MBytes/s] = 1.0E-06* (PM_L3_CO_MEPF)*128/time
+Memory evict data volume [GBytes] = 1.0E-09* (PM_L3_CO_MEPF)*128
+Memory bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128/time
+Memory data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128
diff --git a/groups/power8/NUMA.txt b/groups/power8/NUMA.txt
new file mode 100644
index 000000000..65dece37b
--- /dev/null
+++ b/groups/power8/NUMA.txt
@@ -0,0 +1,27 @@
+SHORT Memory bandwidth in MBytes/s for local and remote memory
+
+EVENTSET
+PMC1  PM_DATA_ALL_FROM_LMEM 
+PMC3  PM_DATA_ALL_FROM_DMEM
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Local bandwidth [MBytes/s] 1.0E-06*(PMC1)*128.0/time
+Local data volume [GBytes] 1.0E-09*(PMC1)*128.0
+Remote bandwidth [MBytes/s] 1.0E-06*(PMC3)*128.0/time
+Remote data volume [GBytes] 1.0E-09*(PMC3)*128.0
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Local bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_LMEM)*128/time
+Local data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_LMEM)*128
+Remote bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_DMEM)*128/time
+Remote data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_DMEM)*128
+Memory load bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128/time
+Memory load data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128
diff --git a/groups/power8/STALLS1.txt b/groups/power8/STALLS1.txt
new file mode 100644
index 000000000..44d3971ea
--- /dev/null
+++ b/groups/power8/STALLS1.txt
@@ -0,0 +1,23 @@
+SHORT Completion stalls (group 1)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL_THRD
+PMC1 PM_CMPLU_STALL_DCACHE_MISS
+PMC2 PM_CMPLU_STALL_COQ_FULL
+PMC3 PM_CMPLU_STALL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Completion stall cycles PMC3
+Stall cycles by thread conflict PMC0
+Stall ratio by thread conflict [%] PMC0/PMC3*100
+Stall cycles by d-cache miss PMC1
+Stall ratio by d-cache miss [%] PMC1/PMC3*100
+Stall cycles by full castout queue PMC2
+Stall ratio by full castout queue [%] PMC2/PMC3*100
+
+
+LONG
diff --git a/groups/power8/STALLS2.txt b/groups/power8/STALLS2.txt
new file mode 100644
index 000000000..5058910d1
--- /dev/null
+++ b/groups/power8/STALLS2.txt
@@ -0,0 +1,19 @@
+SHORT Completion stalls (group 2)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL
+PMC1 PM_CMPLU_STALL_LSU
+PMC2 PM_CMPLU_STALL_FLUSH
+PMC3 PM_CMPLU_STALL_BRU
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Stall cycles PMC0
+Stall cycles by load/store unit PMC1
+Stall cycles by pipeline flush PMC2
+Stall cycles by branch unit PMC3
+
+
+LONG
diff --git a/groups/power8/TLB_DATA.txt b/groups/power8/TLB_DATA.txt
new file mode 100644
index 000000000..8cd5bd543
--- /dev/null
+++ b/groups/power8/TLB_DATA.txt
@@ -0,0 +1,29 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0  PM_DTLB_MISS_16G
+PMC1  PM_DTLB_MISS_4K
+PMC2  PM_DTLB_MISS_64K
+PMC3  PM_DTLB_MISS_16M
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 DTLB 4K misses     PMC1
+L1 DTLB 4K miss rate  PMC1/PMC4
+L1 DTLB 64K misses     PMC2
+L1 DTLB 64K miss rate  PMC2/PMC4
+L1 DTLB 16M misses     PMC3
+L1 DTLB 16M miss rate  PMC3/PMC4
+L1 DTLB 16G misses     PMC0
+L1 DTLB 16G miss rate  PMC0/PMC4
+
+LONG
+Formulas:
+
+-
+The DTLB miss rates give a measure how often a data TLB miss occurred per
+instruction, reported separately for the 4K, 64K, 16M and 16G page sizes.
+
diff --git a/groups/power8/TLB_INSTR.txt b/groups/power8/TLB_INSTR.txt
new file mode 100644
index 000000000..b7690bb23
--- /dev/null
+++ b/groups/power8/TLB_INSTR.txt
@@ -0,0 +1,20 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC2  PM_ITLB_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 ITLB misses     PMC2
+L1 ITLB miss rate  PMC2/PMC4
+
+LONG
+Formulas:
+
+-
+The ITLB miss rate gives a measure how often an instruction TLB miss occurred
+per instruction.
+
diff --git a/groups/power8/USEFUL.txt b/groups/power8/USEFUL.txt
new file mode 100644
index 000000000..0c87c92d5
--- /dev/null
+++ b/groups/power8/USEFUL.txt
@@ -0,0 +1,18 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC1  PM_INST_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+PURR  PURR_STATE
+SPURR SPURR_STATE
+
+METRICS
+CPI  PMC5/PMC4
+Useful instr. rate PMC4/PMC1*100.0
+Processor Utilization SPURR/PURR
+
+
+LONG
+--
+
diff --git a/groups/power9/BRANCH.txt b/groups/power9/BRANCH.txt
new file mode 100644
index 000000000..1f6dd0d56
--- /dev/null
+++ b/groups/power9/BRANCH.txt
@@ -0,0 +1,30 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC1  PM_BR_PRED
+PMC2 PM_IOPS_CMPL
+PMC3  PM_BR_MPRED_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Branch rate   (PMC1)/PMC4
+Branch misprediction rate  PMC3/PMC4
+Branch misprediction ratio  PMC3/(PMC1)
+Instructions per branch  PMC4/(PMC1)
+Operations per branch PMC2/PMC1
+
+LONG
+Formulas:
+Branch rate = PM_BR_PRED/PM_RUN_INST_CMPL
+Branch misprediction rate =  PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL
+Branch misprediction ratio = PM_BR_MPRED_CMPL/PM_BR_PRED
+Instructions per branch = PM_RUN_INST_CMPL/PM_BR_PRED
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/power9/DATA.txt b/groups/power9/DATA.txt
new file mode 100644
index 000000000..a8a7cae00
--- /dev/null
+++ b/groups/power9/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC3  PM_LD_CMPL
+PMC1  PM_ST_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Load to store ratio PMC3/PMC1
+Load rate PMC3/PMC4
+Store rate PMC1/PMC4
+
+LONG
+Formulas:
+Load to store ratio = PM_LD_CMPL/PM_ST_CMPL
+Load rate = PM_LD_CMPL/PM_RUN_INST_CMPL
+Store rate = PM_ST_CMPL/PM_RUN_INST_CMPL
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/power9/FLOPS.txt b/groups/power9/FLOPS.txt
new file mode 100644
index 000000000..fadf81d06
--- /dev/null
+++ b/groups/power9/FLOPS.txt
@@ -0,0 +1,25 @@
+SHORT SP/DP scalar/vector MFlops/s
+
+EVENTSET
+PMC3  PM_FLOP_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s (SP/DP scalar assumed) 1.0E-06*PMC3*2.0/time
+MFlops/s (SP vector assumed) 1.0E-06*PMC3*8.0/time
+MFlops/s (DP vector assumed) 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s (SP/DP scalar assumed) = 1.0E-06*PM_FLOP_CMPL*2.0/runtime
+MFlops/s (SP vector assumed) = 1.0E-06*PM_FLOP_CMPL*8.0/runtime
+MFlops/s (DP vector assumed) = 1.0E-06*PM_FLOP_CMPL*4.0/runtime
+-
+This group counts floating-point operations. All is derived out of a
+single event PM_FLOP_CMPL, so if you have mixed usage of SP or DP and
+scalar and vector operations, the count won't be exact. With pure codes
+the counts are pretty accurate (e.g. when using likwid-bench).
diff --git a/groups/power9/FLOPS_FMA.txt b/groups/power9/FLOPS_FMA.txt
new file mode 100644
index 000000000..53bec7750
--- /dev/null
+++ b/groups/power9/FLOPS_FMA.txt
@@ -0,0 +1,21 @@
+SHORT Floating-point operations with scalar FMA instructions
+
+EVENTSET
+PMC3  PM_FMA_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Scalar FMAs PMC3
+Scalar FMA MFLOP/s 1E-6*(PMC3)*2.0/time
+
+LONG
+Formulas:
+Scalar FMAs = PM_FMA_CMPL
+Scalar FMA MFLOP/s = 1E-6*(PM_FMA_CMPL)*2.0/runtime
+-
+This group counts scalar FMA operations.
+PM_FMA_CMPL: Two-flops instruction completed (fmadd, fnmadd, fmsub,
+fnmsub). Scalar instructions only.
diff --git a/groups/power9/FLOPS_VSX.txt b/groups/power9/FLOPS_VSX.txt
new file mode 100644
index 000000000..c63d0a042
--- /dev/null
+++ b/groups/power9/FLOPS_VSX.txt
@@ -0,0 +1,23 @@
+SHORT Vectorized MFlops/s
+
+EVENTSET
+PMC1  PM_VSU_FIN
+PMC3  PM_VECTOR_FLOP_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+MFlops/s (SP assumed)  1.0E-06*(PMC3*8.0)/time
+MFlops/s (DP assumed)  1.0E-06*(PMC3*4.0)/time
+Vector MIOPS/s   1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+MFlops/s (SP assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*8)/runtime
+MFlops/s (DP assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*4)/runtime
+Vector MIOPS/s = 1.0E-06*(PM_VSU_FIN)/runtime
+-
+
diff --git a/groups/power9/ICACHE.txt b/groups/power9/ICACHE.txt
new file mode 100644
index 000000000..7a07fd4f7
--- /dev/null
+++ b/groups/power9/ICACHE.txt
@@ -0,0 +1,22 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_INST_FROM_L1
+PMC1  PM_L1_ICACHE_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1I request rate PMC0/PMC4
+L1I miss rate PMC1/PMC4
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = PM_INST_FROM_L1 / PM_RUN_INST_CMPL
+L1I miss rate = PM_L1_ICACHE_MISS / PM_RUN_INST_CMPL
+L1I miss ratio = PM_L1_ICACHE_MISS / PM_INST_FROM_L1
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/power9/L2CACHE.txt b/groups/power9/L2CACHE.txt
new file mode 100644
index 000000000..98732512b
--- /dev/null
+++ b/groups/power9/L2CACHE.txt
@@ -0,0 +1,33 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC1  PM_L2_LD_MISS
+PMC2  PM_L2_LD_DISP
+PMC3  PM_L2_ST_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 request rate (PMC2+PMC3)/PMC4
+L2 load miss rate PMC1/PMC4
+L2 load miss ratio PMC1/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 load miss rate = (PM_L2_LD_MISS)/PM_RUN_INST_CMPL
+L2 load miss ratio = (PM_L2_LD_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 load miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm you should
+try to get data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/power9/L2LOAD.txt b/groups/power9/L2LOAD.txt
new file mode 100644
index 000000000..ebac5a2dc
--- /dev/null
+++ b/groups/power9/L2LOAD.txt
@@ -0,0 +1,23 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L2_LD
+PMC2  PM_L2_INST
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 load bandwidth [MBytes/s]  1.0E-06*(PMC0+PMC2)*128.0/time
+L2 load data volume [GBytes]  1.0E-09*(PMC0+PMC2)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2 load bandwidth [MBytes/s] = 1.0E-06*(PM_L2_LD+PM_L2_INST)*128.0/time
+L2 load data volume [GBytes] = 1.0E-09*(PM_L2_LD+PM_L2_INST)*128.0
+-
+Profiling group to measure L2 load cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from L2 cache to L1.
diff --git a/groups/power9/L2STORE.txt b/groups/power9/L2STORE.txt
new file mode 100644
index 000000000..3b1c0afd8
--- /dev/null
+++ b/groups/power9/L2STORE.txt
@@ -0,0 +1,22 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L2_ST
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 store bandwidth [MBytes/s]  1.0E-06*(PMC0)*128.0/time
+L2 store data volume [GBytes]  1.0E-09*(PMC0)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2 store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST)*128.0/time
+L2 store data volume [GBytes] = 1.0E-09*(PM_L2_ST)*128.0
+-
+Profiling group to measure L2 store cache bandwidth. The bandwidth is computed from the
+number of cache lines stored from L1 cache to L2.
diff --git a/groups/power9/L3.txt b/groups/power9/L3.txt
new file mode 100644
index 000000000..cb97ead86
--- /dev/null
+++ b/groups/power9/L3.txt
@@ -0,0 +1,29 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L3_LD_PREF
+PMC3  PM_DATA_FROM_L3
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L3D load bandwidth [MBytes/s]  1.0E-06*(PMC3+PMC0)*128.0/time
+L3D load data volume [GBytes]  1.0E-09*(PMC3+PMC0)*128.0
+Loads from local L3 per cycle 100.0*(PMC3+PMC0)/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0
+L3D evict bandwidth [MBytes/s] = 1.0E-06*(PM_L2_CASTOUT_MOD)*128.0/time
+L3D evict data volume [GBytes] = 1.0E-09*(PM_L2_CASTOUT_MOD)*128.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0/time
+L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L3 to the L2 data cache. There is currently no
+event to get the evicted data volume.
diff --git a/groups/power9/MEM.txt b/groups/power9/MEM.txt
new file mode 100644
index 000000000..022d39d14
--- /dev/null
+++ b/groups/power9/MEM.txt
@@ -0,0 +1,47 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+MBOX0C0 PM_MBA0_READ_BYTES
+MBOX0C1 PM_MBA0_WRITE_BYTES
+MBOX1C0 PM_MBA1_READ_BYTES
+MBOX1C1 PM_MBA1_WRITE_BYTES
+MBOX2C0 PM_MBA2_READ_BYTES
+MBOX2C1 PM_MBA2_WRITE_BYTES
+MBOX3C0 PM_MBA3_READ_BYTES
+MBOX3C1 PM_MBA3_WRITE_BYTES
+MBOX4C0 PM_MBA4_READ_BYTES
+MBOX4C1 PM_MBA4_WRITE_BYTES
+MBOX5C0 PM_MBA5_READ_BYTES
+MBOX5C1 PM_MBA5_WRITE_BYTES
+MBOX6C0 PM_MBA6_READ_BYTES
+MBOX6C1 PM_MBA6_WRITE_BYTES
+MBOX7C0 PM_MBA7_READ_BYTES
+MBOX7C1 PM_MBA7_WRITE_BYTES
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_WRITE_BYTES))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per socket base. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
diff --git a/groups/power9/TLB_DATA.txt b/groups/power9/TLB_DATA.txt
new file mode 100644
index 000000000..3d77654f8
--- /dev/null
+++ b/groups/power9/TLB_DATA.txt
@@ -0,0 +1,42 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0  PM_LSU_DTLB_MISS_16G_1G
+PMC1  PM_LSU_DTLB_MISS_4K
+PMC2  PM_LSU_DTLB_MISS_64K
+PMC3  PM_LSU_DTLB_MISS_16M_2M
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 DTLB 4K misses     PMC1
+L1 DTLB 4K miss rate  PMC1/PMC4
+L1 DTLB 4K miss ratio [%] (PMC1/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 64K misses     PMC2
+L1 DTLB 64K miss rate  PMC2/PMC4
+L1 DTLB 64K miss ratio [%] (PMC2/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16M/2M misses     PMC3
+L1 DTLB 16M/2M miss rate  PMC3/PMC4
+L1 DTLB 16M/2M miss ratio [%] (PMC3/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16G/1G misses     PMC0
+L1 DTLB 16G/1G miss rate  PMC0/PMC4
+L1 DTLB 16G/1G miss ratio [%] (PMC0/(PMC0+PMC1+PMC2+PMC3))*100.0
+
+LONG
+Formulas:
+L1 DTLB 4K misses = PM_LSU_DTLB_MISS_4K
+L1 DTLB 4K miss rate = PM_LSU_DTLB_MISS_4K/PM_RUN_INST_CMPL
+L1 DTLB 4K miss ratio [%] = (PM_LSU_DTLB_MISS_4K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 64K misses = PM_LSU_DTLB_MISS_64K
+L1 DTLB 64K miss rate = PM_LSU_DTLB_MISS_64K/PM_RUN_INST_CMPL
+L1 DTLB 64K miss ratio [%] = (PM_LSU_DTLB_MISS_64K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16M/2M misses = PM_LSU_DTLB_MISS_16M_2M
+L1 DTLB 16M/2M miss rate = PM_LSU_DTLB_MISS_16M_2M/PM_RUN_INST_CMPL
+L1 DTLB 16M/2M miss ratio [%] = (PM_LSU_DTLB_MISS_16M_2M/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16G/1G misses = PM_LSU_DTLB_MISS_16G_1G
+L1 DTLB 16G/1G miss rate = PM_LSU_DTLB_MISS_16G_1G/PM_RUN_INST_CMPL
+L1 DTLB 16G/1G miss ratio [%] = (PM_LSU_DTLB_MISS_16G_1G/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+-
+This group measures the data TLB misses for different page sizes.
diff --git a/groups/power9/TLB_INSTR.txt b/groups/power9/TLB_INSTR.txt
new file mode 100644
index 000000000..dc99d8a2b
--- /dev/null
+++ b/groups/power9/TLB_INSTR.txt
@@ -0,0 +1,21 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC3  PM_ITLB_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 ITLB misses     PMC3
+L1 ITLB miss rate  PMC3/PMC4
+
+LONG
+Formulas:
+L1 ITLB misses = PM_ITLB_MISS
+L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL
+-
+This group measures the reloads of the instruction TLB.
+Misses to the HPT are counted once while misses in the Radix
+tree count the number of tree levels traversed.
diff --git a/groups/power9/USEFUL.txt b/groups/power9/USEFUL.txt
new file mode 100644
index 000000000..bbc20a016
--- /dev/null
+++ b/groups/power9/USEFUL.txt
@@ -0,0 +1,22 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC0  PM_RUN_SPURR
+PMC1  PM_INST_DISP
+PMC3  PM_RUN_PURR
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Useful instr. rate [%] (PMC4/PMC1)*100.0
+Processor Utilization [%] (PMC0/PMC3)*100.0
+
+
+LONG
+Formulas:
+Useful instr. rate [%] = (PM_RUN_INST_CMPL/PM_INST_DISP)*100
+Processor Utilization [%] = (PM_RUN_SPURR/PM_RUN_PURR)*100
+-
+This performance group shows the overhead of speculative
+execution of instructions and the processor utilization.
diff --git a/groups/sandybridge/CLOCK.txt b/groups/sandybridge/CLOCK.txt
index 68ed016f5..a888d6630 100644
--- a/groups/sandybridge/CLOCK.txt
+++ b/groups/sandybridge/CLOCK.txt
@@ -20,7 +20,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Power DRAM =  PWR_DRAM_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
diff --git a/groups/sandybridge/CYCLE_ACTIVITY.txt b/groups/sandybridge/CYCLE_ACTIVITY.txt
index 0b28a48c8..8dbfe257c 100644
--- a/groups/sandybridge/CYCLE_ACTIVITY.txt
+++ b/groups/sandybridge/CYCLE_ACTIVITY.txt
@@ -18,6 +18,7 @@ Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
 Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/sandybridge/CYCLE_STALLS.txt b/groups/sandybridge/CYCLE_STALLS.txt
index feb511b79..d66cbb124 100644
--- a/groups/sandybridge/CYCLE_STALLS.txt
+++ b/groups/sandybridge/CYCLE_STALLS.txt
@@ -21,6 +21,7 @@ Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
 Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/sandybridge/DIVIDE.txt b/groups/sandybridge/DIVIDE.txt
index 9c22a2f09..504181cf7 100644
--- a/groups/sandybridge/DIVIDE.txt
+++ b/groups/sandybridge/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/sandybridge/ENERGY.txt b/groups/sandybridge/ENERGY.txt
index 2b466c8de..9898c70f4 100644
--- a/groups/sandybridge/ENERGY.txt
+++ b/groups/sandybridge/ENERGY.txt
@@ -26,7 +26,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W]  PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/sandybridge/FALSE_SHARE.txt b/groups/sandybridge/FALSE_SHARE.txt
index a87f7d4e6..fbec3f4fb 100644
--- a/groups/sandybridge/FALSE_SHARE.txt
+++ b/groups/sandybridge/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 -
diff --git a/groups/sandybridge/FLOPS_AVX.txt b/groups/sandybridge/FLOPS_AVX.txt
index b4ae4e76d..5a3f14f72 100644
--- a/groups/sandybridge/FLOPS_AVX.txt
+++ b/groups/sandybridge/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX FLOPs rates.
 Please note that the current FLOP measurements on SandyBridge are
diff --git a/groups/sandybridge/FLOPS_DP.txt b/groups/sandybridge/FLOPS_DP.txt
index 8e0e88fa4..91f8a86ea 100644
--- a/groups/sandybridge/FLOPS_DP.txt
+++ b/groups/sandybridge/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates.
diff --git a/groups/sandybridge/FLOPS_SP.txt b/groups/sandybridge/FLOPS_SP.txt
index 994472dba..930a9881f 100644
--- a/groups/sandybridge/FLOPS_SP.txt
+++ b/groups/sandybridge/FLOPS_SP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates.
diff --git a/groups/sandybridge/PORT_USAGE.txt b/groups/sandybridge/PORT_USAGE.txt
index 68d66302b..d509607cd 100644
--- a/groups/sandybridge/PORT_USAGE.txt
+++ b/groups/sandybridge/PORT_USAGE.txt
@@ -28,12 +28,12 @@ Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port1 usage ratio UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port2 usage ratio UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port3 usage ratio UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port4 usage ratio UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port5 usage ratio UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/sandybridge/TMA.txt b/groups/sandybridge/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/sandybridge/TMA.txt
+++ b/groups/sandybridge/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/sandybridge/UOPS.txt b/groups/sandybridge/UOPS.txt
index 8f697ff51..a4d35d834 100644
--- a/groups/sandybridge/UOPS.txt
+++ b/groups/sandybridge/UOPS.txt
@@ -20,7 +20,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Executed UOPs = UOPS_EXECUTED_THREAD
 Retired UOPs = UOPS_RETIRED_ALL
diff --git a/groups/sandybridgeEP/CACHES.txt b/groups/sandybridgeEP/CACHES.txt
index b7ca833ec..b5175a361 100644
--- a/groups/sandybridgeEP/CACHES.txt
+++ b/groups/sandybridgeEP/CACHES.txt
@@ -86,7 +86,7 @@ L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE
 L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
-Memory write bandwidth [MBytes/s] 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
 Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
 Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
diff --git a/groups/sandybridgeEP/CLOCK.txt b/groups/sandybridgeEP/CLOCK.txt
index 68ed016f5..a888d6630 100644
--- a/groups/sandybridgeEP/CLOCK.txt
+++ b/groups/sandybridgeEP/CLOCK.txt
@@ -20,7 +20,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Power DRAM =  PWR_DRAM_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
diff --git a/groups/sandybridgeEP/CYCLE_ACTIVITY.txt b/groups/sandybridgeEP/CYCLE_ACTIVITY.txt
index 0b28a48c8..8dbfe257c 100644
--- a/groups/sandybridgeEP/CYCLE_ACTIVITY.txt
+++ b/groups/sandybridgeEP/CYCLE_ACTIVITY.txt
@@ -18,6 +18,7 @@ Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
 Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/sandybridgeEP/CYCLE_STALLS.txt b/groups/sandybridgeEP/CYCLE_STALLS.txt
index feb511b79..d66cbb124 100644
--- a/groups/sandybridgeEP/CYCLE_STALLS.txt
+++ b/groups/sandybridgeEP/CYCLE_STALLS.txt
@@ -21,6 +21,7 @@ Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
 Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/sandybridgeEP/DIVIDE.txt b/groups/sandybridgeEP/DIVIDE.txt
index 9c22a2f09..504181cf7 100644
--- a/groups/sandybridgeEP/DIVIDE.txt
+++ b/groups/sandybridgeEP/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/sandybridgeEP/ENERGY.txt b/groups/sandybridgeEP/ENERGY.txt
index e5e2b3308..1ab4ef339 100644
--- a/groups/sandybridgeEP/ENERGY.txt
+++ b/groups/sandybridgeEP/ENERGY.txt
@@ -23,7 +23,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W]  PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/sandybridgeEP/FALSE_SHARE.txt b/groups/sandybridgeEP/FALSE_SHARE.txt
index be9c66cac..27f568a0a 100644
--- a/groups/sandybridgeEP/FALSE_SHARE.txt
+++ b/groups/sandybridgeEP/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL
 -
diff --git a/groups/sandybridgeEP/FLOPS_AVX.txt b/groups/sandybridgeEP/FLOPS_AVX.txt
index b4ae4e76d..5a3f14f72 100644
--- a/groups/sandybridgeEP/FLOPS_AVX.txt
+++ b/groups/sandybridgeEP/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX FLOPs rates.
 Please note that the current FLOP measurements on SandyBridge are
diff --git a/groups/sandybridgeEP/FLOPS_DP.txt b/groups/sandybridgeEP/FLOPS_DP.txt
index 8e0e88fa4..91f8a86ea 100644
--- a/groups/sandybridgeEP/FLOPS_DP.txt
+++ b/groups/sandybridgeEP/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates.
diff --git a/groups/sandybridgeEP/FLOPS_SP.txt b/groups/sandybridgeEP/FLOPS_SP.txt
index 994472dba..930a9881f 100644
--- a/groups/sandybridgeEP/FLOPS_SP.txt
+++ b/groups/sandybridgeEP/FLOPS_SP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates.
diff --git a/groups/sandybridgeEP/MEM_DP.txt b/groups/sandybridgeEP/MEM_DP.txt
index 0406226e9..f2d68ba61 100644
--- a/groups/sandybridgeEP/MEM_DP.txt
+++ b/groups/sandybridgeEP/MEM_DP.txt
@@ -28,9 +28,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
@@ -40,13 +40,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
diff --git a/groups/sandybridgeEP/MEM_SP.txt b/groups/sandybridgeEP/MEM_SP.txt
index f78f56aaa..955cdc4ed 100644
--- a/groups/sandybridgeEP/MEM_SP.txt
+++ b/groups/sandybridgeEP/MEM_SP.txt
@@ -28,9 +28,9 @@ Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
@@ -40,13 +40,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
 MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFLOP/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
diff --git a/groups/sandybridgeEP/NUMA.txt b/groups/sandybridgeEP/NUMA.txt
index 0c1b8fb04..41fbe6288 100644
--- a/groups/sandybridgeEP/NUMA.txt
+++ b/groups/sandybridgeEP/NUMA.txt
@@ -20,7 +20,7 @@ Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
 Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
 Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
diff --git a/groups/sandybridgeEP/PORT_USAGE.txt b/groups/sandybridgeEP/PORT_USAGE.txt
index 68d66302b..d509607cd 100644
--- a/groups/sandybridgeEP/PORT_USAGE.txt
+++ b/groups/sandybridgeEP/PORT_USAGE.txt
@@ -28,12 +28,12 @@ Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port1 usage ratio UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port2 usage ratio UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port3 usage ratio UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port4 usage ratio UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port5 usage ratio UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/sandybridgeEP/QPI.txt b/groups/sandybridgeEP/QPI.txt
index f09df0387..320614f1a 100644
--- a/groups/sandybridgeEP/QPI.txt
+++ b/groups/sandybridgeEP/QPI.txt
@@ -25,11 +25,11 @@ Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2
 
 LONG
 Formulas:
-Received bandwidth from QPI [MBytes/s] 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time
-Received data volume from QPI [GBytes] 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8
-Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time
-Data volume QPI to LLC [GBytes] 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64
-Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time
-Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))
+Received bandwidth from QPI [MBytes/s] = 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time
+Received data volume from QPI [GBytes] = 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8
+Bandwidth QPI to LLC [MBytes/s] = 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time
+Data volume QPI to LLC [GBytes] = 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64
+Bandwidth QPI to HA or IIO [MBytes/s] = 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time
+Data volume QPI to HA or IIO [GBytes] = 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))
 -
 Profiling group to measure traffic on the QPI.
diff --git a/groups/sandybridgeEP/TMA.txt b/groups/sandybridgeEP/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/sandybridgeEP/TMA.txt
+++ b/groups/sandybridgeEP/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/sandybridgeEP/UOPS.txt b/groups/sandybridgeEP/UOPS.txt
index 8f697ff51..a4d35d834 100644
--- a/groups/sandybridgeEP/UOPS.txt
+++ b/groups/sandybridgeEP/UOPS.txt
@@ -20,7 +20,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Executed UOPs = UOPS_EXECUTED_THREAD
 Retired UOPs = UOPS_RETIRED_ALL
diff --git a/groups/silvermont/CLOCK.txt b/groups/silvermont/CLOCK.txt
index 088a776b1..b2174c82b 100644
--- a/groups/silvermont/CLOCK.txt
+++ b/groups/silvermont/CLOCK.txt
@@ -15,7 +15,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
diff --git a/groups/silvermont/DIVIDE.txt b/groups/silvermont/DIVIDE.txt
index d7f02f05e..470097e9f 100644
--- a/groups/silvermont/DIVIDE.txt
+++ b/groups/silvermont/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC1
 Avg. divide unit usage duration PMC0/PMC1
 
 LONG
+Formulas:
+Number of divide ops = CYCLES_DIV_BUSY_ANY:EDGEDETECT
+Avg. divide unit usage duration = CYCLES_DIV_BUSY_ANY/CYCLES_DIV_BUSY_ANY:EDGEDETECT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/silvermont/ENERGY.txt b/groups/silvermont/ENERGY.txt
index 96ede02f9..73939a3f0 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -20,7 +20,7 @@ Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 -
diff --git a/groups/skylake/CLOCK.txt b/groups/skylake/CLOCK.txt
index 591451d5a..d682e3ae1 100644
--- a/groups/skylake/CLOCK.txt
+++ b/groups/skylake/CLOCK.txt
@@ -20,7 +20,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Power DRAM =  PWR_DRAM_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
diff --git a/groups/skylake/CYCLE_ACTIVITY.txt b/groups/skylake/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/skylake/CYCLE_ACTIVITY.txt
+++ b/groups/skylake/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/skylake/CYCLE_STALLS.txt b/groups/skylake/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/skylake/CYCLE_STALLS.txt
+++ b/groups/skylake/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/skylake/DIVIDE.txt b/groups/skylake/DIVIDE.txt
index 4f5a0eabf..40b4ab6f3 100644
--- a/groups/skylake/DIVIDE.txt
+++ b/groups/skylake/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_COUNT
+Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/skylake/ENERGY.txt b/groups/skylake/ENERGY.txt
index 06baa720e..07dbda527 100644
--- a/groups/skylake/ENERGY.txt
+++ b/groups/skylake/ENERGY.txt
@@ -28,7 +28,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power PP1 = PWR_PP1_ENERGY / time
diff --git a/groups/skylake/FALSE_SHARE.txt b/groups/skylake/FALSE_SHARE.txt
index 626277a8d..65ff4d413 100644
--- a/groups/skylake/FALSE_SHARE.txt
+++ b/groups/skylake/FALSE_SHARE.txt
@@ -16,7 +16,7 @@ Local LLC false sharing [MByte] 1.E-06*PMC0*64
 Local LLC false sharing rate PMC0/PMC2
 
 LONG
-Formula:
+Formulas:
 Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM*64
 Local LLC false sharing rate = MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM/MEM_INST_RETIRED_ALL
 -
diff --git a/groups/skylake/FLOPS_AVX.txt b/groups/skylake/FLOPS_AVX.txt
index 6088bca3e..ebde7476b 100644
--- a/groups/skylake/FLOPS_AVX.txt
+++ b/groups/skylake/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX FLOPs rates.
 
diff --git a/groups/skylake/FLOPS_DP.txt b/groups/skylake/FLOPS_DP.txt
index 010a8eea8..ff7a83327 100644
--- a/groups/skylake/FLOPS_DP.txt
+++ b/groups/skylake/FLOPS_DP.txt
@@ -13,18 +13,18 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates.
diff --git a/groups/skylake/FLOPS_SP.txt b/groups/skylake/FLOPS_SP.txt
index 219868c8c..3a7d56b6d 100644
--- a/groups/skylake/FLOPS_SP.txt
+++ b/groups/skylake/FLOPS_SP.txt
@@ -13,19 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/skylake/MEM_DP.txt b/groups/skylake/MEM_DP.txt
index 63926faf9..14a359a7a 100644
--- a/groups/skylake/MEM_DP.txt
+++ b/groups/skylake/MEM_DP.txt
@@ -21,10 +21,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory load bandwidth [MBytes/s]  1.0E-06*MBOX0C1*64.0/time
 Memory load data volume [GBytes]  1.0E-09*MBOX0C1*64.0
 Memory evict bandwidth [MBytes/s]  1.0E-06*MBOX0C2*64.0/time
@@ -34,12 +34,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C1+MBOX0C2)*64.0)
 
 LONG
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*DRAM_READS*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime
diff --git a/groups/skylake/MEM_SP.txt b/groups/skylake/MEM_SP.txt
index b2f58aec0..0b4705298 100644
--- a/groups/skylake/MEM_SP.txt
+++ b/groups/skylake/MEM_SP.txt
@@ -21,10 +21,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory load bandwidth [MBytes/s]  1.0E-06*MBOX0C1*64.0/time
 Memory load data volume [GBytes]  1.0E-09*MBOX0C1*64.0
 Memory evict bandwidth [MBytes/s]  1.0E-06*MBOX0C2*64.0/time
@@ -37,10 +37,10 @@ LONG
 Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*8)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*DRAM_READS*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime
diff --git a/groups/skylake/PORT_USAGE.txt b/groups/skylake/PORT_USAGE.txt
index 1cfe431a7..eca8f2ae8 100644
--- a/groups/skylake/PORT_USAGE.txt
+++ b/groups/skylake/PORT_USAGE.txt
@@ -32,14 +32,14 @@ Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7)
 
 LONG
 Formulas:
-Port0 usage ratio UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port1 usage ratio UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port2 usage ratio UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port3 usage ratio UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port4 usage ratio UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port5 usage ratio UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port6 usage ratio UOPS_DISPATCHED_PORT_PORT_6/SUM(UOPS_DISPATCHED_PORT_PORT_*)
-Port7 usage ratio UOPS_DISPATCHED_PORT_PORT_7/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port6 usage ratio = UOPS_DISPATCHED_PORT_PORT_6/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port7 usage ratio = UOPS_DISPATCHED_PORT_PORT_7/SUM(UOPS_DISPATCHED_PORT_PORT_*)
 -
 This group measures the execution port utilization in a CPU core. The group can
 only be measured when HyperThreading is disabled because only then each CPU core
diff --git a/groups/skylake/TMA.txt b/groups/skylake/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/skylake/TMA.txt
+++ b/groups/skylake/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/skylake/UOPS.txt b/groups/skylake/UOPS.txt
index fbb01e1d1..c0a86f22a 100644
--- a/groups/skylake/UOPS.txt
+++ b/groups/skylake/UOPS.txt
@@ -20,7 +20,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Executed UOPs = UOPS_EXECUTED_THREAD
 Retired UOPs = UOPS_RETIRED_ALL
diff --git a/groups/skylakeX/CLOCK.txt b/groups/skylakeX/CLOCK.txt
index 5ff9f6961..b81bee6d1 100644
--- a/groups/skylakeX/CLOCK.txt
+++ b/groups/skylakeX/CLOCK.txt
@@ -17,7 +17,7 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 
 LONG
-Formula:
+Formulas:
 Power =  PWR_PKG_ENERGY / time
 Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
 -
diff --git a/groups/skylakeX/CYCLE_ACTIVITY.txt b/groups/skylakeX/CYCLE_ACTIVITY.txt
index 494222c0e..c432a446d 100644
--- a/groups/skylakeX/CYCLE_ACTIVITY.txt
+++ b/groups/skylakeX/CYCLE_ACTIVITY.txt
@@ -20,6 +20,7 @@ Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
 Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
 Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
diff --git a/groups/skylakeX/CYCLE_STALLS.txt b/groups/skylakeX/CYCLE_STALLS.txt
index 4ef993a95..795aeb9e9 100644
--- a/groups/skylakeX/CYCLE_STALLS.txt
+++ b/groups/skylakeX/CYCLE_STALLS.txt
@@ -24,6 +24,7 @@ Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
 Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
 
 LONG
+Formulas:
 Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
 Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
 Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
diff --git a/groups/skylakeX/DIVIDE.txt b/groups/skylakeX/DIVIDE.txt
index 4f5a0eabf..40b4ab6f3 100644
--- a/groups/skylakeX/DIVIDE.txt
+++ b/groups/skylakeX/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_COUNT
+Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/skylakeX/ENERGY.txt b/groups/skylakeX/ENERGY.txt
index 28f02567e..fe7829fbe 100644
--- a/groups/skylakeX/ENERGY.txt
+++ b/groups/skylakeX/ENERGY.txt
@@ -25,7 +25,7 @@ Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
-Formula:
+Formulas:
 Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
diff --git a/groups/skylakeX/FLOPS_AVX.txt b/groups/skylakeX/FLOPS_AVX.txt
index d3ea8004b..d65b522be 100644
--- a/groups/skylakeX/FLOPS_AVX.txt
+++ b/groups/skylakeX/FLOPS_AVX.txt
@@ -14,13 +14,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Packed SP MFLOP/s  1.0E-06*(PMC0*8.0+PMC2*16.0)/time
-Packed DP MFLOP/s  1.0E-06*(PMC1*4.0+PMC3*8.0)/time
+Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time
+Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time
 
 LONG
-Formula:
-Packed SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*16)/runtime
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
 -
 Packed 32b AVX FLOPs rates.
 
diff --git a/groups/skylakeX/FLOPS_DP.txt b/groups/skylakeX/FLOPS_DP.txt
index 11091902a..177cff2d0 100644
--- a/groups/skylakeX/FLOPS_DP.txt
+++ b/groups/skylakeX/FLOPS_DP.txt
@@ -14,21 +14,20 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
-AVX512 DP MFLOP/s  1.0E-06*(PMC3*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
+AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
 
 LONG
-Formula:
-DP MFLOP/s =
-1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX512 DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
 -
 SSE scalar and packed double precision FLOP rates.
diff --git a/groups/skylakeX/FLOPS_SP.txt b/groups/skylakeX/FLOPS_SP.txt
index 2dc3428c6..01d98c2f2 100644
--- a/groups/skylakeX/FLOPS_SP.txt
+++ b/groups/skylakeX/FLOPS_SP.txt
@@ -14,22 +14,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
-AVX512 SP MFLOP/s  1.0E-06*(PMC3*16.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
+AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
 
 LONG
-Formula:
-SP MFLOP/s =
-1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX512 SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
-Vectorization ratio 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
 -
 SSE scalar and packed single precision FLOP rates.
 
diff --git a/groups/skylakeX/L3.txt b/groups/skylakeX/L3.txt
index 98d1d9e3f..895bf3f37 100644
--- a/groups/skylakeX/L3.txt
+++ b/groups/skylakeX/L3.txt
@@ -6,6 +6,9 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L2_LINES_IN_ALL
 PMC1  L2_TRANS_L2_WB
+PMC2  IDI_MISC_WB_DOWNGRADE
+PMC3  IDI_MISC_WB_UPGRADE
+
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -14,8 +17,12 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
 L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
-L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
-L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0
+L3|MEM evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3|MEM evict data volume [GBytes]  1.0E-09*PMC1*64.0
+Dropped CLs bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
+Dropped CLs data volume [GBytes] 1.0E-9*PMC2*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
@@ -23,14 +30,20 @@ LONG
 Formulas:
 L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
 L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
-L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
-L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*IDI_MISC_WB_UPGRADE*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*IDI_MISC_WB_UPGRADE*64.0
+Dropped CLs bandwidth [MBytes/s] = 1.0E-06*IDI_MISC_WB_DOWNGRADE*64.0/time
+Dropped CLs data volume [GBytes] = 1.0E-9*IDI_MISC_WB_DOWNGRADE*64.0
+# All data leaving L2. CLs might get dropped, evicted to L3 or Memory
+L3|MEM evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3|MEM evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
 L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
 L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
 -
-Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
-number of cache line allocated in the L2 and the number of modified cache lines
-evicted from the L2. This group also output data volume transferred between the
-L3 and measured cores L2 caches. Note that this bandwidth also includes data
-transfers due to a write allocate load on a store miss in L2.
-
+Profiling group to measure L3 cache bandwidth and data volume. For Intel Skylake
+or Cascadelake, the L3 is a victim cache. This means that all data is loaded
+from memory directly into the L2 cache (if L3 prefetcher is inactive). Modified
+data in L2 is evicted to L3 (additional data transfer due to non-inclusiveness of
+L3 can be measured). Clean cache lines (only loaded data) might get dropped in
+L2 to reduce traffic. If the amount of clean cache lines is smaller than L3, they
+might be evicted to L3 due to some heuristic.
diff --git a/groups/skylakeX/MEM_DP.txt b/groups/skylakeX/MEM_DP.txt
index 518fad723..d6e481a86 100644
--- a/groups/skylakeX/MEM_DP.txt
+++ b/groups/skylakeX/MEM_DP.txt
@@ -32,10 +32,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
-AVX DP MFLOP/s  1.0E-06*(PMC2*4.0+PMC3*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
@@ -45,13 +45,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-AVX DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
diff --git a/groups/skylakeX/MEM_SP.txt b/groups/skylakeX/MEM_SP.txt
index 79dabcc99..5720938a0 100644
--- a/groups/skylakeX/MEM_SP.txt
+++ b/groups/skylakeX/MEM_SP.txt
@@ -32,10 +32,10 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
-SP MFLOP/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
-AVX SP MFLOP/s  1.0E-06*(PMC2*8.0+PMC3*16.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2+PMC3)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
+SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
 Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
 Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
@@ -45,13 +45,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBO
 Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
 
 LONG
-Formula:
+Formulas:
 Power [W] = PWR_PKG_ENERGY/runtime
 Power DRAM [W] = PWR_DRAM_ENERGY/runtime
-SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-AVX SP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
 Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
diff --git a/groups/skylakeX/TMA.txt b/groups/skylakeX/TMA.txt
index 6aac3235e..afb412617 100644
--- a/groups/skylakeX/TMA.txt
+++ b/groups/skylakeX/TMA.txt
@@ -25,6 +25,7 @@ Retiring [%] PMC1/(4*FIXC1)*100
 Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
 
 LONG
+Formulas:
 Total Slots = 4*CPU_CLK_UNHALTED_CORE
 Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
 Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
diff --git a/groups/westmere/CLOCK.txt b/groups/westmere/CLOCK.txt
index 9139668fb..5f862a56b 100644
--- a/groups/westmere/CLOCK.txt
+++ b/groups/westmere/CLOCK.txt
@@ -12,7 +12,10 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 
 LONG
-Formula:
+Formulas:
+Runtime (RDTSC) [s] = time
+Runtime unhalted [s] = CPU_CLK_UNHALTED_CORE*inverseClock
+Clock [MHz] = 1.E-06*(CPU_CLK_UNHALTED_CORE/CPU_CLK_UNHALTED_REF)/inverseClock
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 -
-
-
+CPU clock information
diff --git a/groups/westmere/DIVIDE.txt b/groups/westmere/DIVIDE.txt
index fae309aea..2677a1934 100644
--- a/groups/westmere/DIVIDE.txt
+++ b/groups/westmere/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/westmere/FLOPS_DP.txt b/groups/westmere/FLOPS_DP.txt
index 3ee6ebca8..c5c82038e 100644
--- a/groups/westmere/FLOPS_DP.txt
+++ b/groups/westmere/FLOPS_DP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 Westmere has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/westmere/FLOPS_SP.txt b/groups/westmere/FLOPS_SP.txt
index 2b0f6a353..933b05836 100644
--- a/groups/westmere/FLOPS_SP.txt
+++ b/groups/westmere/FLOPS_SP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 Westmere has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/westmere/FLOPS_X87.txt b/groups/westmere/FLOPS_X87.txt
index a4176f046..39cd8b49d 100644
--- a/groups/westmere/FLOPS_X87.txt
+++ b/groups/westmere/FLOPS_X87.txt
@@ -11,8 +11,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
+Formulas:
+X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime
+-
 Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/westmere/MEM_DP.txt b/groups/westmere/MEM_DP.txt
index 828f98ca8..64161dd70 100644
--- a/groups/westmere/MEM_DP.txt
+++ b/groups/westmere/MEM_DP.txt
@@ -18,11 +18,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
 Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
@@ -39,11 +39,11 @@ Operational intensity (PMC0*2.0+PMC1)/((UPMC0+UPMC1)*64.0)
 
 LONG
 Formulas:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
@@ -56,7 +56,7 @@ Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITE
 Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
 Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
 Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
-Operational intensity (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0)
+Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0)
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 This group will be measured by one core per socket. The remote read BW tells
diff --git a/groups/westmere/MEM_SP.txt b/groups/westmere/MEM_SP.txt
index d4f821144..812c7fa03 100644
--- a/groups/westmere/MEM_SP.txt
+++ b/groups/westmere/MEM_SP.txt
@@ -18,11 +18,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
 Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
 Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
@@ -39,11 +39,11 @@ Operational intensity (PMC0*4.0+PMC1)/((UPMC0+UPMC1)*64.0)
 
 LONG
 Formulas:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
 Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
 Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
@@ -56,7 +56,7 @@ Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITE
 Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
 Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
 Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
-Operational intensity (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0)
+Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0)
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 This group will be measured by one core per socket. The remote read BW tells
diff --git a/groups/westmere/UOPS.txt b/groups/westmere/UOPS.txt
index 2567704ac..b2446aab3 100644
--- a/groups/westmere/UOPS.txt
+++ b/groups/westmere/UOPS.txt
@@ -22,7 +22,7 @@ Executed UOPs PMC1
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FUSED
 Executed UOPs = UOPS_EXECUTED_THREAD
diff --git a/groups/westmere/VIEW.txt b/groups/westmere/VIEW.txt
index 76809eda1..38d907cbb 100644
--- a/groups/westmere/VIEW.txt
+++ b/groups/westmere/VIEW.txt
@@ -19,12 +19,12 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFLOP/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s] (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
+SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
 Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
 Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
@@ -32,18 +32,18 @@ Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
 Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
 
 LONG
-Formula:
-DP MFLOP/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFLOP/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-Packed MUOPS/s   1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
-Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
-SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
-DP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time
-Memory bandwidth [MBytes/s] 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Formulas:
+DP [MFLOP/s] (DP assumed) = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+SP [MFLOP/s] (SP assumed) = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] =  1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
+Scalar [MUOPS/s] =  1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
+SP [MUOPS/s] =  1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
+DP [MUOPS/s] =  1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
+Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time
+Remote Write BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Remote BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
 -
 This is a overview group using the capabilities of Westmere to measure multiple events at
 the same time.
diff --git a/groups/westmereEX/DIVIDE.txt b/groups/westmereEX/DIVIDE.txt
index fae309aea..2677a1934 100644
--- a/groups/westmereEX/DIVIDE.txt
+++ b/groups/westmereEX/DIVIDE.txt
@@ -17,4 +17,8 @@ Number of divide ops PMC0
 Avg. divide unit usage duration PMC1/PMC0
 
 LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/westmereEX/FLOPS_DP.txt b/groups/westmereEX/FLOPS_DP.txt
index 01160e6ca..0c2e56c11 100644
--- a/groups/westmereEX/FLOPS_DP.txt
+++ b/groups/westmereEX/FLOPS_DP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFLOP/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-DP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/westmereEX/FLOPS_SP.txt b/groups/westmereEX/FLOPS_SP.txt
index f8e39229b..d7c8e8ecb 100644
--- a/groups/westmereEX/FLOPS_SP.txt
+++ b/groups/westmereEX/FLOPS_SP.txt
@@ -14,19 +14,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFLOP/s 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time
+Packed [MUOPS/s]   1.0E-06*PMC0/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+SP [MUOPS/s] 1.0E-06*PMC2/time
+DP [MUOPS/s] 1.0E-06*PMC3/time
 
 LONG
-Formula:
-SP MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Westmere EX has no possibility to measure MFLOPs if mixed precision calculations are done.
 Therefore both single as well as double precision are measured to ensure the correctness
diff --git a/groups/westmereEX/FLOPS_X87.txt b/groups/westmereEX/FLOPS_X87.txt
index a4176f046..39cd8b49d 100644
--- a/groups/westmereEX/FLOPS_X87.txt
+++ b/groups/westmereEX/FLOPS_X87.txt
@@ -11,8 +11,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-X87 MFLOP/s  1.0E-06*PMC0/time
+X87 [MFLOP/s]  1.0E-06*PMC0/time
 
 LONG
+Formulas:
+X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime
+-
 Profiling group to measure X87 FLOP rate.
 
diff --git a/groups/westmereEX/NUMA.txt b/groups/westmereEX/NUMA.txt
index 0c1b8fb04..41fbe6288 100644
--- a/groups/westmereEX/NUMA.txt
+++ b/groups/westmereEX/NUMA.txt
@@ -20,7 +20,7 @@ Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
 Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
 
 LONG
-Formula:
+Formulas:
 CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
 Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
 Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
diff --git a/groups/westmereEX/UOPS.txt b/groups/westmereEX/UOPS.txt
index 59d2e091c..f58fda6a0 100644
--- a/groups/westmereEX/UOPS.txt
+++ b/groups/westmereEX/UOPS.txt
@@ -20,7 +20,7 @@ Merged UOPs PMC3
 Retired UOPs PMC2
 
 LONG
-Formula:
+Formulas:
 Issued UOPs = UOPS_ISSUED_ANY
 Merged UOPs = UOPS_ISSUED_FUSED
 Retired UOPs = UOPS_RETIRED_ANY
diff --git a/groups/zen/DIVIDE.txt b/groups/zen/DIVIDE.txt
index f071f5254..c98500b9a 100644
--- a/groups/zen/DIVIDE.txt
+++ b/groups/zen/DIVIDE.txt
@@ -18,4 +18,9 @@ Number of divide ops PMC2
 Avg. divide unit usage duration PMC3/PMC2
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+Number of divide ops = DIV_OP_COUNT
+Avg. divide unit usage duration = DIV_BUSY_CYCLES/DIV_OP_COUNT
+-
 This performance group measures the average latency of divide operations
diff --git a/groups/zen/ENERGY.txt b/groups/zen/ENERGY.txt
index 7600ed352..f58c5b10e 100644
--- a/groups/zen/ENERGY.txt
+++ b/groups/zen/ENERGY.txt
@@ -21,9 +21,9 @@ Energy PKG [J]  PWR1
 Power PKG [W] PWR1/time
 
 LONG
-Formula:
-Power Core [W] RAPL_CORE_ENERGY/time
-Power PKG [W] RAPL_PKG_ENERGY/time
+Formulas:
+Power Core [W] = RAPL_CORE_ENERGY/time
+Power PKG [W] = RAPL_PKG_ENERGY/time
 -
 Ryzen implements the RAPL interface previously introduced by Intel.
 This interface enables to monitor the consumed energy on the core and package
diff --git a/groups/zen/FLOPS_DP.txt b/groups/zen/FLOPS_DP.txt
index d0e7816fc..fd3d73a50 100644
--- a/groups/zen/FLOPS_DP.txt
+++ b/groups/zen/FLOPS_DP.txt
@@ -13,20 +13,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s]   FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI   FIXC0/PMC0
-DP MFLOP/s (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*2))/time
-DP MFLOP/s (SSE assumed)   1.0E-06*(PMC3+PMC2+(PMC1*2))/time
-DP MFLOP/s (AVX assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*2))/time
+DP [MFLOP/s] (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*2))/time
+DP [MFLOP/s] (SSE assumed)   1.0E-06*(PMC3+PMC2+(PMC1*2))/time
+DP [MFLOP/s] (AVX assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*2))/time
 
 LONG
 Formulas:
 CPI = INST_RETIRED_ANY/ACTUAL_CPU_CLOCK
-DP MFLOP/s (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
-DP MFLOP/s (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + RETIRED_MMX_FP_INSTR_ALL+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
-DP MFLOP/s (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+DP [MFLOP/s] (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+DP [MFLOP/s] (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + RETIRED_MMX_FP_INSTR_ALL+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+DP [MFLOP/s] (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
 -
 Profiling group to measure double precisision FLOP rate. The Zen architecture
-does not provide distinct events for SSE and AVX FLOPs. Moreover, scalar FP
-instructions are counted as SSE instruction in RETIRED_MMX_FP_INSTR_ALL.
-Therefore, you have to select the DP MFLOP/s metric based on the measured code.
+does not provide distinct events for SSE and AVX FLOPs. Moreover, all FP
+instructions are counted in RETIRED_MMX_FP_INSTR_ALL.
+Therefore, you have to select the DP MFLOP/s metric based on the measured code.
+Moreover, the SSE and AVX metrics overcount but SSE metrics are more accurate.
 
 
diff --git a/groups/zen/FLOPS_SP.txt b/groups/zen/FLOPS_SP.txt
index 0e97b8804..4c3ebc422 100644
--- a/groups/zen/FLOPS_SP.txt
+++ b/groups/zen/FLOPS_SP.txt
@@ -14,21 +14,20 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s]   FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI   FIXC1/PMC0
-SP MFLOP/s (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*4))/time
-SP MFLOP/s (SSE assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*4))/time
-SP MFLOP/s (AVX assumed)   1.0E-06*(PMC3+(PMC2*4)+(PMC1*4))/time
+SP [MFLOP/s] (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*4))/time
+SP [MFLOP/s] (SSE assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*4))/time
+SP [MFLOP/s] (AVX assumed)   1.0E-06*(PMC3+(PMC2*4)+(PMC1*4))/time
 
 
 LONG
 Formulas:
 CPI = INST_RETIRED_ANY/ACTUAL_CPU_CLOCK
-SP MFLOP/s (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
-SP MFLOP/s (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
-SP MFLOP/s (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*4)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+SP [MFLOP/s] (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+SP [MFLOP/s] (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+SP [MFLOP/s] (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*4)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
 -
 Profiling group to measure single precisision FLOP rate. The Zen architecture
-does not provide distinct events for SSE and AVX FLOPs. Moreover, scalar FP
-instructions are counted as SSE instruction in RETIRED_MMX_FP_INSTR_ALL.
+does not provide distinct events for SSE and AVX FLOPs. Moreover, all FP
+instructions are counted in RETIRED_MMX_FP_INSTR_ALL.
 Therefore, you have to select the SP MFLOP/s metric based on the measured code.
-
-
+Moreover, the SSE and AVX metrics overcount but SSE metrics are more accurate.
diff --git a/groups/zen/L2.txt b/groups/zen/L2.txt
new file mode 100644
index 000000000..d3b7a2839
--- /dev/null
+++ b/groups/zen/L2.txt
@@ -0,0 +1,32 @@
+SHORT L2 cache bandwidth in MBytes/s (experimental)
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  MAB_ALLOC_PIPE_STORE
+PMC3  DATA_CACHE_REFILLS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  PMC1/PMC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC3*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC2*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*MAB_ALLOC_PIPE_STORE*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*MAB_ALLOC_PIPE_STORE*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+MAB_ALLOC_PIPE_STORE)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+MAB_ALLOC_PIPE_STORE)*64
+-
+Profiling group to measure L2 cache bandwidth. This group is experimental!
diff --git a/groups/zen/MEM.txt b/groups/zen/MEM.txt
new file mode 100644
index 000000000..c399fb253
--- /dev/null
+++ b/groups/zen/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Main memory bandwidth in MBytes/s (experimental)
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+DFC0  DATA_FROM_LOCAL_DRAM_CHANNEL
+DFC1  DATA_TO_LOCAL_DRAM_CHANNEL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  PMC1/PMC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(DFC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(DFC0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(DFC1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(DFC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL)*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(DATA_TO_LOCAL_DRAM_CHANNEL)*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on a
+per-socket basis.
+Even though the group provides almost accurate results for the total bandwidth
+and data volume, the read and write bandwidths and data volumes seem off. AMD
+describes this metric as "approximate" in the documentation for AMD Rome.
diff --git a/groups/zen/TLB.txt b/groups/zen/TLB.txt
index 7730d1c82..510284ba2 100644
--- a/groups/zen/TLB.txt
+++ b/groups/zen/TLB.txt
@@ -23,12 +23,12 @@ L2 DTLB miss ratio    PMC3/(PMC2+PMC3)
 
 LONG
 Formulas:
-L1 DTLB request rate  DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
-L1 DTLB miss rate  (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
-L1 DTLB miss ratio  (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES
-L2 DTLB request rate  (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
-L2 DTLB miss rate  L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS
-L2 DTLB miss ratio L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)
+L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
+L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
+L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS
+L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)
 -
 L1 DTLB request  rate tells you how data intensive your code is
 or how many data accesses you have on average per instruction.
diff --git a/groups/zen2/BRANCH.txt b/groups/zen2/BRANCH.txt
new file mode 100644
index 000000000..dbaf07fd3
--- /dev/null
+++ b/groups/zen2/BRANCH.txt
@@ -0,0 +1,32 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  RETIRED_BRANCH_INSTR
+PMC3  RETIRED_MISP_BRANCH_INSTR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   PMC1/PMC0
+Branch rate   PMC2/PMC0
+Branch misprediction rate  PMC3/PMC0
+Branch misprediction ratio  PMC3/PMC2
+Instructions per branch  PMC0/PMC2
+
+LONG
+Formulas:
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISP_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISP_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio states directly
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/groups/zen2/CACHE.txt b/groups/zen2/CACHE.txt
new file mode 100644
index 000000000..b773e5a2a
--- /dev/null
+++ b/groups/zen2/CACHE.txt
@@ -0,0 +1,39 @@
+SHORT Data cache miss rate/ratio
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  DATA_CACHE_ACCESSES
+PMC3  DATA_CACHE_REFILLS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   PMC1/PMC0
+data cache requests PMC2
+data cache request rate PMC2/PMC0
+data cache misses PMC3
+data cache miss rate PMC3/PMC0
+data cache miss ratio PMC3/PMC2
+
+LONG
+Formulas:
+data cache requests = DATA_CACHE_ACCESSES
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache misses = DATA_CACHE_REFILLS_ALL
+data cache miss rate = DATA_CACHE_REFILLS_ALL / RETIRED_INSTRUCTIONS
+data cache miss ratio = DATA_CACHE_REFILLS_ALL / DATA_CACHE_ACCESSES
+-
+This group measures the locality of your data accesses with regard to the
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm you should try to get data cache miss ratio
+as low as possible by increasing your cache reuse.
+
diff --git a/monitoring/groups/kabini/CPI.txt b/groups/zen2/CPI.txt
similarity index 53%
rename from monitoring/groups/kabini/CPI.txt
rename to groups/zen2/CPI.txt
index d599a346d..23e4f8c77 100644
--- a/monitoring/groups/kabini/CPI.txt
+++ b/groups/zen2/CPI.txt
@@ -1,16 +1,27 @@
 SHORT  Cycles per instruction
 
 EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
 PMC0  RETIRED_INSTRUCTIONS
 PMC1  CPU_CLOCKS_UNHALTED
 PMC2  RETIRED_UOPS
 
 METRICS
-CPI   PMC1/PMC0
-Cycles per UOPS  PMC1/PMC2
-IPC   PMC0/PMC1
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   PMC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI PMC1/PMC0
+CPI (based on uops)   PMC1/PMC2
+IPC PMC0/PMC1
+
 
 LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
 This group measures how efficient the processor works with
 regard to instruction throughput. Also important as a standalone
 metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
diff --git a/groups/zen2/DATA.txt b/groups/zen2/DATA.txt
new file mode 100644
index 000000000..e061b9043
--- /dev/null
+++ b/groups/zen2/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  LS_DISPATCH_LOADS
+PMC3  LS_DISPATCH_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   PMC1/PMC0
+Load to store ratio PMC2/PMC3
+
+LONG
+Formulas:
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
+-
+This is a simple metric to determine your load to store ratio.
+
diff --git a/groups/zen2/DIVIDE.txt b/groups/zen2/DIVIDE.txt
new file mode 100644
index 000000000..f071f5254
--- /dev/null
+++ b/groups/zen2/DIVIDE.txt
@@ -0,0 +1,21 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  DIV_OP_COUNT
+PMC3  DIV_BUSY_CYCLES
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI PMC1/PMC0
+Number of divide ops PMC2
+Avg. divide unit usage duration PMC3/PMC2
+
+LONG
+This performance group measures the average latency of divide operations
diff --git a/groups/zen2/ENERGY.txt b/groups/zen2/ENERGY.txt
new file mode 100644
index 000000000..7600ed352
--- /dev/null
+++ b/groups/zen2/ENERGY.txt
@@ -0,0 +1,32 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PWR0  RAPL_CORE_ENERGY
+PWR1  RAPL_PKG_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   PMC1/PMC0
+Energy Core [J]  PWR0
+Power Core [W] PWR0/time
+Energy PKG [J]  PWR1
+Power PKG [W] PWR1/time
+
+LONG
+Formulas:
+Power Core [W] = RAPL_CORE_ENERGY/time
+Power PKG [W] = RAPL_PKG_ENERGY/time
+-
+Ryzen implements the RAPL interface previously introduced by Intel.
+This interface enables to monitor the consumed energy on the core and package
+domain.
+It is not documented by AMD which parts of the CPU are in which domain.
+
diff --git a/groups/zen2/FLOPS_DP.txt b/groups/zen2/FLOPS_DP.txt
new file mode 100644
index 000000000..d0e7816fc
--- /dev/null
+++ b/groups/zen2/FLOPS_DP.txt
@@ -0,0 +1,32 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA
+PMC2  RETIRED_MMX_FP_INSTR_ALL
+PMC3  RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   FIXC1/PMC0
+DP MFLOP/s (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*2))/time
+DP MFLOP/s (SSE assumed)   1.0E-06*(PMC3+PMC2+(PMC1*2))/time
+DP MFLOP/s (AVX assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*2))/time
+
+LONG
+Formulas:
+CPI = ACTUAL_CPU_CLOCK/RETIRED_INSTRUCTIONS
+DP MFLOP/s (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+DP MFLOP/s (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + RETIRED_MMX_FP_INSTR_ALL+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+DP MFLOP/s (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA*2))/time
+-
+Profiling group to measure double precision FLOP rate. The Zen architecture
+does not provide distinct events for SSE and AVX FLOPs. Moreover, scalar FP
+instructions are counted as SSE instruction in RETIRED_MMX_FP_INSTR_ALL.
+Therefore, you have to select the DP MFLOP/s metric based on the measured code.
+
+
diff --git a/groups/zen2/FLOPS_SP.txt b/groups/zen2/FLOPS_SP.txt
new file mode 100644
index 000000000..0e97b8804
--- /dev/null
+++ b/groups/zen2/FLOPS_SP.txt
@@ -0,0 +1,34 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INST_RETIRED_ANY
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  RETIRED_SSE_AVX_FLOPS_SINGLE_FMA
+PMC2  RETIRED_MMX_FP_INSTR_ALL
+PMC3  RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   FIXC1/PMC0
+SP MFLOP/s (scalar assumed)   1.0E-06*(PMC3+(PMC2/2)+(PMC1*4))/time
+SP MFLOP/s (SSE assumed)   1.0E-06*(PMC3+(PMC2*2)+(PMC1*4))/time
+SP MFLOP/s (AVX assumed)   1.0E-06*(PMC3+(PMC2*4)+(PMC1*4))/time
+
+
+LONG
+Formulas:
+CPI = ACTUAL_CPU_CLOCK/RETIRED_INSTRUCTIONS
+SP MFLOP/s (scalar assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL/2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+SP MFLOP/s (SSE assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*2)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+SP MFLOP/s (AVX assumed) = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV + (RETIRED_MMX_FP_INSTR_ALL*4)+(RETIRED_SSE_AVX_FLOPS_SINGLE_FMA*4))/time
+-
+Profiling group to measure single precision FLOP rate. The Zen architecture
+does not provide distinct events for SSE and AVX FLOPs. Moreover, scalar FP
+instructions are counted as SSE instruction in RETIRED_MMX_FP_INSTR_ALL.
+Therefore, you have to select the SP MFLOP/s metric based on the measured code.
+
+
diff --git a/groups/zen2/ICACHE.txt b/groups/zen2/ICACHE.txt
new file mode 100644
index 000000000..f98c28af5
--- /dev/null
+++ b/groups/zen2/ICACHE.txt
@@ -0,0 +1,28 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  ICACHE_FETCHES
+PMC2  ICACHE_L2_REFILLS
+PMC3  ICACHE_SYSTEM_REFILLS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   FIXC1/PMC0
+L1I request rate   PMC1/PMC0
+L1I miss rate    (PMC2+PMC3)/PMC0
+L1I miss ratio   (PMC2+PMC3)/PMC1
+
+LONG
+Formulas:
+L1I request rate = ICACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/ICACHE_FETCHES
+-
+This group measures the locality of your instruction code with regard to the
+L1 I-Cache.
+
diff --git a/groups/zen2/MEM.txt b/groups/zen2/MEM.txt
new file mode 100644
index 000000000..474e957f0
--- /dev/null
+++ b/groups/zen2/MEM.txt
@@ -0,0 +1,36 @@
+SHORT Main memory bandwidth in MBytes/s (experimental)
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+DFC0  DATA_FROM_LOCAL_DRAM_CHANNEL
+DFC1  DATA_TO_LOCAL_DRAM_CHANNEL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  PMC1/PMC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(DFC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(DFC0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(DFC1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(DFC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL)*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(DATA_TO_LOCAL_DRAM_CHANNEL)*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on a
+per-socket basis.
+Even though the group provides almost accurate results for the total bandwidth
+and data volume, the read and write bandwidths and data volumes seem off.
diff --git a/groups/zen2/TLB.txt b/groups/zen2/TLB.txt
new file mode 100644
index 000000000..7730d1c82
--- /dev/null
+++ b/groups/zen2/TLB.txt
@@ -0,0 +1,39 @@
+SHORT  TLB miss rate/ratio
+
+EVENTSET
+FIXC1 ACTUAL_CPU_CLOCK
+FIXC2 MAX_CPU_CLOCK
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  DATA_CACHE_ACCESSES
+PMC2  L1_DTLB_MISS_ANY_L2_HIT
+PMC3  L1_DTLB_MISS_ANY_L2_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s]   FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI   FIXC1/PMC0
+L1 DTLB request rate  PMC1/PMC0
+L1 DTLB miss rate   (PMC2+PMC3)/PMC0
+L1 DTLB miss ratio   (PMC2+PMC3)/PMC1
+L2 DTLB request rate   (PMC2+PMC3)/PMC0
+L2 DTLB miss rate    PMC3/PMC0
+L2 DTLB miss ratio    PMC3/(PMC2+PMC3)
+
+
+LONG
+Formulas:
+L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
+L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS
+L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS
+L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)
+-
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure how often a TLB miss occurred
+per instruction. And finally L1 DTLB miss ratio tells you how many
+of your memory references caused a TLB miss on average.
+NOTE: The L2 metrics are only relevant if L2 DTLB request rate is
+equal to the L1 DTLB miss rate!
diff --git a/make/config_defines.mk b/make/config_defines.mk
index e1c3b869a..f881eed32 100644
--- a/make/config_defines.mk
+++ b/make/config_defines.mk
@@ -1,17 +1,18 @@
 DEFINES   += -DVERSION=$(VERSION)         \
-		 -DRELEASE=$(RELEASE)                 \
-		 -DMINORVERSION=$(MINOR)                 \
-		 -DCFGFILE=$(CFG_FILE_PATH)           \
-		 -DTOPOFILE=$(TOPO_FILE_PATH)           \
-		 -DINSTALL_PREFIX=$(INSTALLED_PREFIX) \
-		 -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
-		 -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
-		 -DACCESSDAEMON=$(INSTALLED_ACCESSDAEMON) \
-		 -DGROUPPATH=$(LIKWIDGROUPPATH) \
-		 -DLIKWIDLOCK=$(LIKWIDLOCKPATH) \
-		 -DLIKWIDSOCKETBASE=$(LIKWIDSOCKETBASE) \
-		 -DGITCOMMIT=$(GITCOMMIT) \
-		 -D_GNU_SOURCE
+             -DRELEASE=$(RELEASE)                 \
+             -DMINORVERSION=$(MINOR)                 \
+             -DCFGFILE=$(CFG_FILE_PATH)           \
+             -DTOPOFILE=$(TOPO_FILE_PATH)           \
+             -DINSTALL_PREFIX=$(INSTALLED_PREFIX) \
+             -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
+             -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
+             -DACCESSDAEMON=$(INSTALLED_ACCESSDAEMON) \
+             -DFREQDAEMON=$(INSTALLED_FREQDAEMON) \
+             -DGROUPPATH=$(LIKWIDGROUPPATH) \
+             -DLIKWIDLOCK=$(LIKWIDLOCKPATH) \
+             -DLIKWIDSOCKETBASE=$(LIKWIDSOCKETBASE) \
+             -DGITCOMMIT=$(GITCOMMIT) \
+             -D_GNU_SOURCE
 
 COMPILER := $(strip $(COMPILER))
 
@@ -36,6 +37,10 @@ HWLOC_FOLDER := $(PWD)/ext/hwloc
 STATIC_LIBHWLOC := liblikwid-hwloc.a
 SHARED_LIBHWLOC := liblikwid-hwloc.so
 
+GOTCHA_FOLDER := $(PWD)/ext/GOTCHA
+STATIC_LIBGOTCHA := liblikwid-gotcha.a
+SHARED_LIBGOTCHA := liblikwid-gotcha.so
+
 BENCH_FOLDER := bench
 BENCH_NAME := likwid-bench
 BENCH_TARGET := $(BENCH_FOLDER)/$(BENCH_NAME)
@@ -47,62 +52,136 @@ endif
 ifeq ($(strip $(COMPILER)),MIC)
     ifeq ($(strip $(ACCESSMODE)),sysdaemon)
         $(info Info: Compiling for Xeon Phi. Changing accessmode to direct.)
-        ACCESSMODE = direct
-        BUILDDAEMON = false
+        ACCESSMODE := direct
+        BUILDDAEMON := false
+        BUILDFREQ := false
     endif
     ifeq ($(strip $(ACCESSMODE)),accessdaemon)
         $(info Info: Compiling for Xeon Phi. Changing accessmode to direct.)
-        ACCESSMODE = direct
-        BUILDDAEMON = false
+        ACCESSMODE := direct
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),perf_event)
+        $(info Info: Compiling for Xeon Phi. Changing accessmode to direct.)
+        ACCESSMODE := direct
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),direct)
+        BUILDDAEMON := false
+        BUILDFREQ := false
     endif
 endif
 
 ifeq ($(strip $(COMPILER)),GCCARMv8)
     ifeq ($(strip $(ACCESSMODE)),sysdaemon)
-        $(info Info: Compiling for ARMv8. Changing accessmode to perf_event.)
+        $(info Info: Compiling for ARMv8 architecture. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),accessdaemon)
+        $(info Info: Compiling for ARMv8 architecture. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),direct)
+        $(info Info: Compiling for ARMv8 architecture. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),perf_event)
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+endif
+
+ifeq ($(strip $(COMPILER)),GCCARMv7)
+    ifeq ($(strip $(ACCESSMODE)),sysdaemon)
+        $(info Info: Compiling for ARMv7 architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
         BUILDDAEMON = false
         BUILDFREQ = false
     endif
     ifeq ($(strip $(ACCESSMODE)),accessdaemon)
-        $(info Info: Compiling for ARMv8. Changing accessmode to perf_event.)
+        $(info Info: Compiling for ARMv7 architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
         BUILDDAEMON = false
         BUILDFREQ = false
     endif
     ifeq ($(strip $(ACCESSMODE)),direct)
-        $(info Info: Compiling for ARMv8. Changing accessmode to perf_event.)
+        $(info Info: Compiling for ARMv7 architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
-        BUILDDAEMON = false
         BUILDFREQ = false
+        BUILDDAEMON = false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),perf_event)
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
     endif
 endif
 
-ifeq ($(strip $(COMPILER)),GCCARMv7)
+ifeq ($(strip $(COMPILER)),GCCPOWER)
     ifeq ($(strip $(ACCESSMODE)),sysdaemon)
-        $(info Info: Compiling for ARMv7. Changing accessmode to perf_event.)
+        $(info Info: Compiling for POWER architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
         BUILDDAEMON = false
         BUILDFREQ = false
     endif
     ifeq ($(strip $(ACCESSMODE)),accessdaemon)
-        $(info Info: Compiling for ARMv7. Changing accessmode to perf_event.)
+        $(info Info: Compiling for POWER architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
         BUILDDAEMON = false
         BUILDFREQ = false
     endif
     ifeq ($(strip $(ACCESSMODE)),direct)
-        $(info Info: Compiling for ARMv7. Changing accessmode to perf_event.)
+        $(info Info: Compiling for POWER architecture. Changing accessmode to perf_event.)
         ACCESSMODE := perf_event
         DEFINES += -DLIKWID_USE_PERFEVENT
         BUILDFREQ = false
         BUILDDAEMON = false
     endif
+    ifeq ($(strip $(ACCESSMODE)),perf_event)
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON := false
+        BUILDFREQ := false
+    endif
+endif
+ifeq ($(strip $(COMPILER)),GCCPOWER)
+    ifeq ($(strip $(ACCESSMODE)),sysdaemon)
+        $(info Info: Compiling for IBM POWER. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON = false
+        BUILDFREQ = false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),accessdaemon)
+        $(info Info: Compiling for IBM POWER. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON = false
+        BUILDFREQ = false
+    endif
+    ifeq ($(strip $(ACCESSMODE)),direct)
+        $(info Info: Compiling for IBM POWER. Changing accessmode to perf_event.)
+        ACCESSMODE := perf_event
+        DEFINES += -DLIKWID_USE_PERFEVENT
+        BUILDDAEMON = false
+        BUILDFREQ = false
+    endif
 endif
 
 ifeq ($(strip $(BUILDDAEMON)),true)
@@ -117,14 +196,19 @@ else
 endif
 
 ifeq ($(strip $(BUILDFREQ)),true)
-ifneq ($(strip $(COMPILER)),MIC)
-    FREQ_TARGET = likwid-setFreq
+    ifneq ($(strip $(COMPILER)),MIC)
+        FREQ_TARGET = likwid-setFreq
+    else
+        $(info Info: Compiling for Xeon Phi. Disabling build of likwid-setFreq.);
+        FREQ_TARGET =
+    endif
 else
-    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-setFreq.);
     FREQ_TARGET =
 endif
+ifeq ($(strip $(BUILDAPPDAEMON)),true)
+	APPDAEMON_TARGET = likwid-appDaemon.so
 else
-    FREQ_TARGET =
+	APPDAEMON_TARGET =
 endif
 
 ifeq ($(strip $(HAS_MEMPOLICY)),1)
@@ -140,10 +224,12 @@ LIBS += -L. -pthread -lm -ldl
 TARGET_LIB := $(DYNAMIC_TARGET_LIB)
 TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(SHARED_LIBHWLOC)
 TARGET_LUA_LIB=$(LUA_LIB_DIR)/$(SHARED_LIBLUA)
+TARGET_GOTCHA_LIB=$(GOTCHA_LIB_DIR)/$(SHARED_LIBGOTCHA)
 else
 TARGET_LIB := $(STATIC_TARGET_LIB)
 TARGET_HWLOC_LIB=$(HWLOC_FOLDER)/$(STATIC_LIBHWLOC)
 TARGET_LUA_LIB=$(LUA_LIB_DIR)/$(STATIC_LIBLUA)
+TARGET_GOTCHA_LIB=$(GOTCHA_LIB_DIR)/$(STATIC_LIBGOTCHA)
 endif
 
 ifeq ($(strip $(HAS_SCHEDAFFINITY)),1)
diff --git a/make/include_GCCPOWER.mk b/make/include_GCCPOWER.mk
new file mode 100644
index 000000000..7ded6995b
--- /dev/null
+++ b/make/include_GCCPOWER.mk
@@ -0,0 +1,33 @@
+CC  = gcc
+FC  = ifort
+AS  = as -mpower8
+AR  = ar
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
+
+ANSI_CFLAGS   =
+#ANSI_CFLAGS += -pedantic
+#ANSI_CFLAGS += -Wextra
+#ANSI_CFLAGS += -Wall
+
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
+FCFLAGS  = -module ./  # ifort
+#FCFLAGS  = -J ./  -fsyntax-only  #gfortran
+PASFLAGS  = ppc64
+ASFLAGS  = 
+CPPFLAGS =
+LFLAGS   =  -pthread 
+
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
+
+DEFINES  = -DPAGE_ALIGNMENT=4096
+DEFINES  += -DLIKWID_MONITOR_LOCK
+DEFINES  += -DDEBUGLEV=0
+
+INCLUDES =
+LIBS     = -lm -lrt
+
+
diff --git a/make/include_XLC.mk b/make/include_XLC.mk
new file mode 100644
index 000000000..cfa63836e
--- /dev/null
+++ b/make/include_XLC.mk
@@ -0,0 +1,33 @@
+CC  = xlc
+FC  = ifort
+AS  = as -mpower8
+AR  = ar
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
+
+ANSI_CFLAGS   =
+#ANSI_CFLAGS += -pedantic
+#ANSI_CFLAGS += -Wextra
+#ANSI_CFLAGS += -Wall
+
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
+FCFLAGS  = -module ./  # ifort
+#FCFLAGS  = -J ./  -fsyntax-only  #gfortran
+PASFLAGS  = ppc64
+ASFLAGS  = 
+CPPFLAGS =
+LFLAGS   =  -pthread 
+
+SHARED_CFLAGS = -fPIC -fvisibility=hidden
+SHARED_LFLAGS = -shared -fvisibility=hidden
+
+DEFINES  = -DPAGE_ALIGNMENT=4096
+DEFINES  += -DLIKWID_MONITOR_LOCK
+DEFINES  += -DDEBUGLEV=0
+
+INCLUDES =
+LIBS     = -lm -lrt
+
+
diff --git a/monitoring/README.agent b/monitoring/README.agent
deleted file mode 100644
index 756d01500..000000000
--- a/monitoring/README.agent
+++ /dev/null
@@ -1,66 +0,0 @@
-The likwid-agent application is a daemon that reads hardware performance
-counters in a periodic fashion. Which counters can be measured is determined by
-the system's CPU architecture. Each architecture has its own set of events and
-corresponding counter registers. For the measurement the likwid library is used
-and interfaced through the Lua interface. The measured values can be exported in
-multiple ways like RRD, syslog or gmetric from the Ganglia Monitoring System.
-
-
-The configuration file needs to be given at startup and has the following
-format:
-GROUPPATH <PATH_TO_GROUPS> # default is set during installation
-EVENTSET <SPACE_SEPARATED_LIST_OF_GROUPS>
-DURATION <TIME_IN_SECONDS_TO_MEASURE_EACH_GROUP>
-ACCESSMODE <0/1> # 0 is direct access, 1 forward access to the accessDaemon
-LOGPATH <PATH_TO_STORE_LOGFILES> # each montitoring group creates a logfile there named likwid.<GROUP>.log
-LOGSTYLE <log/update> # log appends new lines, update clears file previously
-GMETRIC <True/False> # send measured values to Gangla
-GMETRICPATH <PATH_TO_THE_GMETRIC_EXECUTABLE>
-GMETRICCONFIG <EXTRA_CONFIG_OPTIONS_TO_GMETRIC>
-RRD <True/False> # write measured values to RRD files, one RRD per group
-RRDPATH <PATH_TO_STORE_RRD_FILES>
-SYSLOG <True/False> # write measured values to syslog
-SYSLOGPRIO <prio> # Use priority level <prio> for syslog, default is local0.notice
-
-
-
-The group files cannot lie directly in GROUPPATH, you need to create a folder
-with the short name of the architecture like sandybridge or ivybridge. This
-enables to use the same group path distributed over a set of systems with different
-CPU architecture. The format of a group file is the following:
-SHORT <SHORT_NAME_OF_THE GROUP>
-
-EVENTSET // Starts event/counter definitions
-FIXC0 INSTR_RETIRED_ANY // Measure event INSTR_RETIRED_ANY in counter FIXC0
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS // Starts section of derived metrics and output items
-ONCE Runtime (RDTSC) [s] time # Output runtime only once
-MIN CPI FIXC1/FIXC0 # Output the minimum of the formula FIXC1/FIXC0 named CPI
-AVG CPI FIXC1/FIXC0 # Output the average of the same formula
-MAX L2 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time # Calculate bandwidth and output only the maximum
-MIN L2 load data volume [GBytes]  1.0E-09*PMC0*64.0
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time # Sum up all the values of all CPUs
-SUM L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
-
-LONG
-<LONG DESCRIPTION OF THE GROUP>
-
-Possible functions are:
-ONCE: Output only once (CPU core 0), no aggregation is done
-MIN: Output the minimum of all cores
-MAX: Output the maximum of all cores
-AVG: Output the average of all cores
-SUM: Output the sum of all cores' values
-If no function is set, the values of all HW threads is written to output and
-T<ID> is written in front of the name.
-
-The output metric names can be equal, the function is glued to the output name for later separation.
-
diff --git a/monitoring/groups/atom/BW_MEM.txt b/monitoring/groups/atom/BW_MEM.txt
deleted file mode 100644
index 8eb701f00..000000000
--- a/monitoring/groups/atom/BW_MEM.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-SHORT Memory bandwidth
-
-EVENTSET
-PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
-
-METRICS
-SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-
-
-LONG
diff --git a/monitoring/groups/atom/FLOPS_DP.txt b/monitoring/groups/atom/FLOPS_DP.txt
deleted file mode 100644
index 14961f066..000000000
--- a/monitoring/groups/atom/FLOPS_DP.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
-PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
-
-METRICS
-DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
-
-
-LONG
-Double Precision MFlops/s Double Precision MFlops/s
-
diff --git a/monitoring/groups/atom/FLOPS_SP.txt b/monitoring/groups/atom/FLOPS_SP.txt
deleted file mode 100644
index d67704f14..000000000
--- a/monitoring/groups/atom/FLOPS_SP.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  SIMD_COMP_INST_RETIRED_PACKED_SINGLE
-PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
-
-METRICS
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-
-LONG
-Single Precision MFlops/s Double Precision MFlops/s
-
diff --git a/monitoring/groups/broadwell/BW.txt b/monitoring/groups/broadwell/BW.txt
deleted file mode 100644
index 3a2eb900d..000000000
--- a/monitoring/groups/broadwell/BW.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/broadwell/ENERGY.txt b/monitoring/groups/broadwell/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/broadwell/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/broadwell/FLOPS_DP.txt b/monitoring/groups/broadwell/FLOPS_DP.txt
deleted file mode 100644
index 53b2463aa..000000000
--- a/monitoring/groups/broadwell/FLOPS_DP.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
-PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
-PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
--
-AVX/SSE scalar and packed double precision flop rates.
-
diff --git a/monitoring/groups/broadwell/FLOPS_SP.txt b/monitoring/groups/broadwell/FLOPS_SP.txt
deleted file mode 100644
index b04f87aa9..000000000
--- a/monitoring/groups/broadwell/FLOPS_SP.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
-PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
-PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
--
-AVX/SSE scalar and packed single precision flop rates.
-
diff --git a/monitoring/groups/broadwellEP/BW.txt b/monitoring/groups/broadwellEP/BW.txt
deleted file mode 100644
index 3a2eb900d..000000000
--- a/monitoring/groups/broadwellEP/BW.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/broadwellEP/ENERGY.txt b/monitoring/groups/broadwellEP/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/broadwellEP/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/core2/BW_L2.txt b/monitoring/groups/core2/BW_L2.txt
deleted file mode 100644
index 6d73bf8c3..000000000
--- a/monitoring/groups/core2/BW_L2.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-SHORT Cache bandwidth
-
-EVENTSET
-PMC0  L1D_REPL
-PMC1  L1D_M_EVICT
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-
-
-LONG
diff --git a/monitoring/groups/core2/BW_MEM.txt b/monitoring/groups/core2/BW_MEM.txt
deleted file mode 100644
index 8eb701f00..000000000
--- a/monitoring/groups/core2/BW_MEM.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-SHORT Memory bandwidth
-
-EVENTSET
-PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
-
-METRICS
-SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-
-
-LONG
diff --git a/monitoring/groups/haswell/BW.txt b/monitoring/groups/haswell/BW.txt
deleted file mode 100644
index 3a2eb900d..000000000
--- a/monitoring/groups/haswell/BW.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/haswell/ENERGY.txt b/monitoring/groups/haswell/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/haswell/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/haswellEP/BW.txt b/monitoring/groups/haswellEP/BW.txt
deleted file mode 100644
index e6f4b73e6..000000000
--- a/monitoring/groups/haswellEP/BW.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-MBOX4C0 CAS_COUNT_RD
-MBOX4C1 CAS_COUNT_WR
-MBOX5C0 CAS_COUNT_RD
-MBOX5C1 CAS_COUNT_WR
-MBOX6C0 CAS_COUNT_RD
-MBOX6C1 CAS_COUNT_WR
-MBOX7C0 CAS_COUNT_RD
-MBOX7C1 CAS_COUNT_WR
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
-SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/haswellEP/ENERGY.txt b/monitoring/groups/haswellEP/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/haswellEP/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/interlagos/BW.txt b/monitoring/groups/interlagos/BW.txt
deleted file mode 100644
index 3f465f6c2..000000000
--- a/monitoring/groups/interlagos/BW.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  DATA_CACHE_REFILLS_ALL
-PMC1  DATA_CACHE_REFILLS_SYSTEM
-PMC2  L2_FILL_WB_FILL
-PMC3  L2_FILL_WB_WB
-UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
-UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/interlagos/CPI.txt b/monitoring/groups/interlagos/CPI.txt
deleted file mode 100644
index d599a346d..000000000
--- a/monitoring/groups/interlagos/CPI.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-SHORT  Cycles per instruction
-
-EVENTSET
-PMC0  RETIRED_INSTRUCTIONS
-PMC1  CPU_CLOCKS_UNHALTED
-PMC2  RETIRED_UOPS
-
-METRICS
-CPI   PMC1/PMC0
-Cycles per UOPS  PMC1/PMC2
-IPC   PMC0/PMC1
-
-LONG
-This group measures how efficient the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is RETIRED_INSTRUCTIONS as it tells you how many instruction
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
-
diff --git a/monitoring/groups/interlagos/FLOPS.txt b/monitoring/groups/interlagos/FLOPS.txt
deleted file mode 100644
index 7bfb29a79..000000000
--- a/monitoring/groups/interlagos/FLOPS.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  RETIRED_FLOPS_DOUBLE_ALL
-PMC1  RETIRED_FLOPS_SINGLE_ALL
-
-METRICS
-DP MFlops/s    1.0E-06*(PMC0)/time
-SP MFlops/s    1.0E-06*(PMC1)/time
-
-LONG
-Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
--
-Profiling group to measure double precisision flop rate.
-
-
diff --git a/monitoring/groups/ivybridge/BW.txt b/monitoring/groups/ivybridge/BW.txt
deleted file mode 100644
index 3a2eb900d..000000000
--- a/monitoring/groups/ivybridge/BW.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/ivybridge/CYCLE_ACTIVITY.txt b/monitoring/groups/ivybridge/CYCLE_ACTIVITY.txt
deleted file mode 100644
index a4bf45da6..000000000
--- a/monitoring/groups/ivybridge/CYCLE_ACTIVITY.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-SHORT Cycle Activities
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-#PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
-PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
-#PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
-PMC1 CYCLE_ACTIVITY_STALLS_L2_PENDING
-#PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
-PMC0 CYCLE_ACTIVITY_STALLS_LDM_PENDING
-PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Cycles without execution [%] PMC3/FIXC1*100
-Cycles with stalls due to L1D [%] PMC2/FIXC1*100
-Cycles with stalls due to L2 [%] PMC1/FIXC1*100
-Cycles with stalls due to LDM [%] PMC0/FIXC1*100
-
-LONG
-Formulas
diff --git a/monitoring/groups/ivybridge/ENERGY.txt b/monitoring/groups/ivybridge/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/ivybridge/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/ivybridge/FLOPS_DP.txt b/monitoring/groups/ivybridge/FLOPS_DP.txt
deleted file mode 100644
index 496b8a54f..000000000
--- a/monitoring/groups/ivybridge/FLOPS_DP.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
--
-SSE scalar and packed double precision flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
-
diff --git a/monitoring/groups/ivybridge/FLOPS_SP.txt b/monitoring/groups/ivybridge/FLOPS_SP.txt
deleted file mode 100644
index 64edd199b..000000000
--- a/monitoring/groups/ivybridge/FLOPS_SP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
--
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/ivybridgeEP/BW.txt b/monitoring/groups/ivybridgeEP/BW.txt
deleted file mode 100644
index e6f4b73e6..000000000
--- a/monitoring/groups/ivybridgeEP/BW.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-MBOX4C0 CAS_COUNT_RD
-MBOX4C1 CAS_COUNT_WR
-MBOX5C0 CAS_COUNT_RD
-MBOX5C1 CAS_COUNT_WR
-MBOX6C0 CAS_COUNT_RD
-MBOX6C1 CAS_COUNT_WR
-MBOX7C0 CAS_COUNT_RD
-MBOX7C1 CAS_COUNT_WR
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
-SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/ivybridgeEP/CYCLE_ACTIVITY.txt b/monitoring/groups/ivybridgeEP/CYCLE_ACTIVITY.txt
deleted file mode 100644
index a4bf45da6..000000000
--- a/monitoring/groups/ivybridgeEP/CYCLE_ACTIVITY.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-SHORT Cycle Activities
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-#PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
-PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
-#PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
-PMC1 CYCLE_ACTIVITY_STALLS_L2_PENDING
-#PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
-PMC0 CYCLE_ACTIVITY_STALLS_LDM_PENDING
-PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Cycles without execution [%] PMC3/FIXC1*100
-Cycles with stalls due to L1D [%] PMC2/FIXC1*100
-Cycles with stalls due to L2 [%] PMC1/FIXC1*100
-Cycles with stalls due to LDM [%] PMC0/FIXC1*100
-
-LONG
-Formulas
diff --git a/monitoring/groups/ivybridgeEP/ENERGY.txt b/monitoring/groups/ivybridgeEP/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/ivybridgeEP/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_DP.txt b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
deleted file mode 100644
index 496b8a54f..000000000
--- a/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
--
-SSE scalar and packed double precision flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
-
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_SP.txt b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
deleted file mode 100644
index 64edd199b..000000000
--- a/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
--
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/kabini/BW.txt b/monitoring/groups/kabini/BW.txt
deleted file mode 100644
index 7e340787a..000000000
--- a/monitoring/groups/kabini/BW.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  DATA_CACHE_REFILLS_ALL
-PMC1  DATA_CACHE_EVICTED_ALL
-UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
-UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
-
-
-METRICS
-SUM L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
-SUM Memory bandwidth [MBytes/s]   1.0E-06*(UPMC0+UPMC1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/kabini/FLOPS.txt b/monitoring/groups/kabini/FLOPS.txt
deleted file mode 100644
index ccb2f92ef..000000000
--- a/monitoring/groups/kabini/FLOPS.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  RETIRED_FLOPS_DOUBLE_ALL
-PMC1  RETIRED_FLOPS_SINGLE_ALL
-
-METRICS
-DP MFlops/s    1.0E-06*(PMC0)/time
-SP MFlops/s    1.0E-06*(PMC1)/time
-
-LONG
-Formulas:
-DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
-SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
diff --git a/monitoring/groups/nehalem/BW.txt b/monitoring/groups/nehalem/BW.txt
deleted file mode 100644
index ddc8c8240..000000000
--- a/monitoring/groups/nehalem/BW.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  L1D_REPL
-PMC1  L1D_M_EVICT
-PMC2  L2_LINES_IN_ANY
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-UPMC0  UNC_QMC_NORMAL_READS_ANY
-UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
-
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
-SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/nehalem/CPI.txt b/monitoring/groups/nehalem/CPI.txt
deleted file mode 100644
index 9852da84e..000000000
--- a/monitoring/groups/nehalem/CPI.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Cycles per instruction
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-
-METRICS
-CPI  FIXC1/FIXC0
-IPC  FIXC0/FIXC1
-
-
-LONG
-CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
-IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/nehalem/FLOPS.txt b/monitoring/groups/nehalem/FLOPS.txt
deleted file mode 100644
index e37250412..000000000
--- a/monitoring/groups/nehalem/FLOPS.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-
-METRICS
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/nehalemEX/BW.txt b/monitoring/groups/nehalemEX/BW.txt
deleted file mode 100644
index 473ce769e..000000000
--- a/monitoring/groups/nehalemEX/BW.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  L1D_REPL
-PMC1  L1D_M_EVICT
-PMC2  L2_LINES_IN_ANY
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS
-MBOX0C1 DRAM_CMD_CAS_WR_OPN
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS
-MBOX1C1 DRAM_CMD_CAS_WR_OPN
-
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
-
-LONG
-Formula:
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time
-
-On Nehalem EX it is not possible to measure the write operations with the
-FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
-because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
-it only measures write operations to open pages, hence writes to closed pages
-are not included here.
diff --git a/monitoring/groups/nehalemEX/CPI.txt b/monitoring/groups/nehalemEX/CPI.txt
deleted file mode 100644
index 0e4faa3da..000000000
--- a/monitoring/groups/nehalemEX/CPI.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-SHORT Cycles per instruction
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-
-METRICS
-CPI  FIXC1/FIXC0
-
-
-LONG
-CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
diff --git a/monitoring/groups/nehalemEX/FLOPS.txt b/monitoring/groups/nehalemEX/FLOPS.txt
deleted file mode 100644
index e37250412..000000000
--- a/monitoring/groups/nehalemEX/FLOPS.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-
-METRICS
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/pentiumm/BW.txt b/monitoring/groups/pentiumm/BW.txt
deleted file mode 100644
index 5877abc07..000000000
--- a/monitoring/groups/pentiumm/BW.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  L2_LINES_IN_ALL_ALL
-PMC1  L2_LINES_OUT_ALL_ALL
-
-METRICS
-L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-
-LONG
-Formulas:
-L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
diff --git a/monitoring/groups/pentiumm/CPI.txt b/monitoring/groups/pentiumm/CPI.txt
deleted file mode 100644
index fb0d97b2e..000000000
--- a/monitoring/groups/pentiumm/CPI.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-SHORT  Cycles per instruction
-
-EVENTSET
-PMC0  UOPS_RETIRED
-PMC1  CPU_CLK_UNHALTED
-
-METRICS
-CPI   PMC1/PMC0
-IPC   PMC0/PMC1
-
-LONG
-This group measures how efficient the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is UOPS_RETIRED as it tells you how many uops
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
-
diff --git a/monitoring/groups/phi/CPI.txt b/monitoring/groups/phi/CPI.txt
deleted file mode 100644
index 0ce61cd3f..000000000
--- a/monitoring/groups/phi/CPI.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-SHORT  Cycles per instruction
-
-EVENTSET
-PMC0  INSTRUCTIONS_EXECUTED
-PMC1  CPU_CLK_UNHALTED
-
-METRICS
-CPI   PMC1/PMC0
-IPC   PMC0/PMC1
-
-LONG
-This group measures how efficient the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
-
diff --git a/monitoring/groups/sandybridge/BW.txt b/monitoring/groups/sandybridge/BW.txt
deleted file mode 100644
index 3a2eb900d..000000000
--- a/monitoring/groups/sandybridge/BW.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/sandybridge/CYCLE_ACTIVITY.txt b/monitoring/groups/sandybridge/CYCLE_ACTIVITY.txt
deleted file mode 100644
index 40abcb63e..000000000
--- a/monitoring/groups/sandybridge/CYCLE_ACTIVITY.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT Cycle Activities
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-#PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
-PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
-#PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
-PMC1 CYCLE_ACTIVITY_STALLS_L2_PENDING
-PMC3 CYCLE_ACTIVITY_CYCLES_NO_DISPATCH
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Cycles without execution [%] PMC3/FIXC1*100
-Cycles with stalls due to L1D [%] PMC2/FIXC1*100
-Cycles with stalls due to L2 [%] PMC1/FIXC1*100
-
-LONG
-Formulas
diff --git a/monitoring/groups/sandybridge/ENERGY.txt b/monitoring/groups/sandybridge/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/sandybridge/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/sandybridge/FLOPS_DP.txt b/monitoring/groups/sandybridge/FLOPS_DP.txt
deleted file mode 100644
index c004b884f..000000000
--- a/monitoring/groups/sandybridge/FLOPS_DP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
--
-SSE scalar and packed double precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/sandybridge/FLOPS_SP.txt b/monitoring/groups/sandybridge/FLOPS_SP.txt
deleted file mode 100644
index f9e6df76d..000000000
--- a/monitoring/groups/sandybridge/FLOPS_SP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
--
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on SandyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/sandybridgeEP/BW.txt b/monitoring/groups/sandybridgeEP/BW.txt
deleted file mode 100644
index 18eea4ff2..000000000
--- a/monitoring/groups/sandybridgeEP/BW.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  L1D_REPLACEMENT
-PMC1  L2_TRANS_L1D_WB
-PMC2  L2_LINES_IN_ALL
-PMC3  L2_LINES_OUT_DEMAND_DIRTY
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/sandybridgeEP/CYCLE_ACTIVITY.txt b/monitoring/groups/sandybridgeEP/CYCLE_ACTIVITY.txt
deleted file mode 100644
index 40abcb63e..000000000
--- a/monitoring/groups/sandybridgeEP/CYCLE_ACTIVITY.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-SHORT Cycle Activities
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-#PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
-PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
-#PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
-PMC1 CYCLE_ACTIVITY_STALLS_L2_PENDING
-PMC3 CYCLE_ACTIVITY_CYCLES_NO_DISPATCH
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Cycles without execution [%] PMC3/FIXC1*100
-Cycles with stalls due to L1D [%] PMC2/FIXC1*100
-Cycles with stalls due to L2 [%] PMC1/FIXC1*100
-
-LONG
-Formulas
diff --git a/monitoring/groups/sandybridgeEP/ENERGY.txt b/monitoring/groups/sandybridgeEP/ENERGY.txt
deleted file mode 100644
index 7256f1ed3..000000000
--- a/monitoring/groups/sandybridgeEP/ENERGY.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-PWR3 PWR_DRAM_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-SUM Total Power DRAM [W] PWR3/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_DP.txt b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
deleted file mode 100644
index c004b884f..000000000
--- a/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Double Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
--
-SSE scalar and packed double precision flop rates. Please note that the current
-flop measurements on IvyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_SP.txt b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
deleted file mode 100644
index f9e6df76d..000000000
--- a/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-SHORT Single Precision MFlops/s
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-
-METRICS
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
-AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-
-LONG
-Formula:
-MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
-AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
-Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
--
-SSE scalar and packed single precision flop rates. Please note that the current
-flop measurements on SandyBridge are potentially wrong. So you cannot trust
-these counters at the moment!
-
diff --git a/monitoring/groups/silvermont/BW.txt b/monitoring/groups/silvermont/BW.txt
deleted file mode 100644
index 952e64af0..000000000
--- a/monitoring/groups/silvermont/BW.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-SHORT Cache and memory bandwidths
-
-EVENTSET
-PMC0  LONGEST_LAT_CACHE_MISS
-PMC1  OFFCORE_RESPONSE_1_WB_ANY
-
-METRICS
-SUM Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-SUM Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-
-LONG
diff --git a/monitoring/groups/silvermont/CPI.txt b/monitoring/groups/silvermont/CPI.txt
deleted file mode 100644
index 4eb4d40c4..000000000
--- a/monitoring/groups/silvermont/CPI.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Cycles per instruction
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-
-METRICS
-CPI FIXC0/FIXC1
-IPC FIXC1/FIXC0
-
-LONG
-CPI = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
-IPC = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
diff --git a/monitoring/groups/silvermont/ENERGY.txt b/monitoring/groups/silvermont/ENERGY.txt
deleted file mode 100644
index 3814560be..000000000
--- a/monitoring/groups/silvermont/ENERGY.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-SHORT Energy, CPI and Clock
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PWR0 PWR_PKG_ENERGY
-
-METRICS
-ONCE Runtime (RDTSC) [s] time
-SUM Retired instructions FIXC0
-SUM Total Power [W] PWR0/time
-CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI FIXC0/FIXC1
-
-LONG
diff --git a/monitoring/groups/westmere/BW.txt b/monitoring/groups/westmere/BW.txt
deleted file mode 100644
index 492507796..000000000
--- a/monitoring/groups/westmere/BW.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  L1D_REPL
-PMC1  L1D_M_EVICT
-PMC2  L2_LINES_IN_ANY
-PMC3  L2_LINES_OUT_ANY
-UPMC0  UNC_QMC_NORMAL_READS_ANY
-UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
-SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
-
-LONG
diff --git a/monitoring/groups/westmere/CPI.txt b/monitoring/groups/westmere/CPI.txt
deleted file mode 100644
index 9852da84e..000000000
--- a/monitoring/groups/westmere/CPI.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Cycles per instruction
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-
-METRICS
-CPI  FIXC1/FIXC0
-IPC  FIXC0/FIXC1
-
-
-LONG
-CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
-IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmere/FLOPS.txt b/monitoring/groups/westmere/FLOPS.txt
deleted file mode 100644
index e37250412..000000000
--- a/monitoring/groups/westmere/FLOPS.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-
-METRICS
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/westmereEX/BW.txt b/monitoring/groups/westmereEX/BW.txt
deleted file mode 100644
index a960025cb..000000000
--- a/monitoring/groups/westmereEX/BW.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Cache and memory bandwidth
-
-EVENTSET
-PMC0  L1D_REPL
-PMC1  L1D_M_EVICT
-PMC2  L2_LINES_IN_ANY
-PMC3  L2_LINES_OUT_ANY
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS
-MBOX0C1 DRAM_CMD_CAS_WR_OPN
-MBOX0C2 DRAM_MISC_CAS_WR_CLS
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS
-MBOX1C1 DRAM_CMD_CAS_WR_OPN
-MBOX1C2 DRAM_MISC_CAS_WR_CLS
-
-METRICS
-SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
-SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
-
-LONG
diff --git a/monitoring/groups/westmereEX/CPI.txt b/monitoring/groups/westmereEX/CPI.txt
deleted file mode 100644
index 9852da84e..000000000
--- a/monitoring/groups/westmereEX/CPI.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-SHORT Cycles per instruction
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-
-METRICS
-CPI  FIXC1/FIXC0
-IPC  FIXC0/FIXC1
-
-
-LONG
-CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
-IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmereEX/FLOPS.txt b/monitoring/groups/westmereEX/FLOPS.txt
deleted file mode 100644
index e37250412..000000000
--- a/monitoring/groups/westmereEX/FLOPS.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-SHORT Floating point operations
-
-EVENTSET
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-
-METRICS
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-
-LONG
-Formulas:
-Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
-Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
-SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
-DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/likwid-agent.conf b/monitoring/likwid-agent.conf
deleted file mode 100644
index 2af8c821f..000000000
--- a/monitoring/likwid-agent.conf
+++ /dev/null
@@ -1,55 +0,0 @@
-### Global section ###
-
-# Set path to monitoring group files. Default is the normal LIKWID group path
-# <INSTALLEDPREFIX>/share/likwid/mongroups
-#GROUPPATH <path_to_mon_groups>
-# List of monitoring groups that should be measured
-#EVENTSET <group1> <group2> ...
-# Define access mode for LIKWID. If likwid-agent runs as root, use 0 for direct
-# access to the MSR and PCI registers. If you are running it as common user, you
-# have to select 1 to use the accessDaemon of LIKWID. Default is 1.
-#ACCESSMODE <0/1>
-# Define the time in seconds that each given monitoring group should be measured
-#DURATION 1
-
-
-### Output section ###
-
-## Simple logfile output ##
-# Specify path for the logfile. For each monitoring group a own logfile is
-# created with the format likwid.<group>.log
-#LOGPATH <path>
-# Specify the logfile writing style. The two possible options are log and
-# update.
-# log appends all new messages to the logfile, while update empties the logfile
-# before performing any writing. The update option is recommended when the
-# output is further parsed with other tools. If LOGPATH is set but no LOGSTYLE
-# set, the style log is selected.
-#LOGSTYLE <log/update>
-
-## Syslog output ##
-# De/Activate the output to the syslog system using shell tool logger
-#SYSLOG <True/False>
-# Define the priority value for logger. Default priority is local0.notice.
-#SYSLOGPRIO local0.notice
-
-## RRD output ##
-# Likwid-agent tries to create basic RRD configurations for the selected
-# groups. Each monitoring group gets its own RRD file containing all metrics
-# as data sources. For better printing, RRAs are created to hold the min, max
-# and average values for every 10 minutes in the last hour, every hour for the
-# last day and every day for the last month.
-#RRD <True/False>
-# Store the RRDs in RRDPATH
-#RRDPATH <path>
-
-## GMetric output ##
-# De/Activate the output to the Ganglia Monitoring System using the gmetric tool
-#GMETRIC <True/False>
-# Set path to the executable of gmetric.
-#GMETRICPATH <path_to_gmetric>
-# In some environments they need to hand over a special config file for gmetric.
-#GMETRICCONFIG <path_to_gmetric_config>
-# If you want to send a specific group with gmetric. If nothing is set but gmetric
-# has a --group option, the name of the monitoring groups is used.
-#GMETRICGROUP <groupname>
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index 8470de884..576e57399 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -31,6 +31,8 @@ include  ../../make/include_$(COMPILER).mk
 
 DAEMON_TARGET = likwid-accessD
 SETFREQ_TARGET = likwid-setFreq
+APPDAEMON_TARGET = likwid-appDaemon.so
+GOTCHA_FOLDER = ../../ext/GOTCHA
 Q         ?= @
 
 DEFINES   += -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -DMAX_NUM_NODES=$(MAX_NUM_NODES) -DLIKWIDLOCK=$(LIKWIDLOCKPATH) -DLIKWIDSOCKETBASE=$(LIKWIDSOCKETBASE)
@@ -44,12 +46,14 @@ CPPFLAGS :=  $(DEFINES) $(INCLUDES)
 ifeq ($(COMPILER),GCCARMv8)
 all:
 else
-all: $(DAEMON_TARGET) $(SETFREQ_TARGET)
+all: $(DAEMON_TARGET) $(SETFREQ_TARGET) $(APPDAEMON_TARGET)
+endif
 
 $(DAEMON_TARGET): accessDaemon.c
 	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
 
-$(SETFREQ_TARGET): setFreq.c setFreq_cpufreq.c setFreq_pstate.c
-	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -I. -o ../../$(SETFREQ_TARGET) setFreq.c setFreq_cpufreq.c setFreq_pstate.c
+$(SETFREQ_TARGET): setFreqDaemon.c
+	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreqDaemon.c
 
-endif
+$(APPDAEMON_TARGET): $(GOTCHA_TARGET) appDaemon.c
+	$(Q)$(CC) -shared -fPIC $(CPPFLAGS) -Wl,-soname,$(APPDAEMON_TARGET).$(VERSION).$(RELEASE) -fstack-protector -I. -I$(GOTCHA_FOLDER)/include  -L$(GOTCHA_FOLDER) appDaemon.c -o ../../$(APPDAEMON_TARGET)  -llikwid-gotcha
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index 371e64c14..11c87a627 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -772,11 +772,11 @@ static int allowed_skx(uint32_t reg)
         return 1;
     else
     {
-        syslog(LOG_ERR, "Testing 0x%X %d %d\n", reg, (reg & 0xF00U), (reg & 0xA00U));
         if (((reg & 0xF00U) == 0x700U) ||
             ((reg & 0xF00U) == 0xE00U) ||
             ((reg & 0xF00U) == 0xF00U) ||
             (reg == MSR_PREFETCH_ENABLE) ||
+            (reg == TSX_FORCE_ABORT) ||
             ((reg & 0xA00U) == 0xA00U))
             return 1;
     }
diff --git a/src/access-daemon/appDaemon.c b/src/access-daemon/appDaemon.c
new file mode 100644
index 000000000..c2e7139cc
--- /dev/null
+++ b/src/access-daemon/appDaemon.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <gotcha/gotcha.h>
+
+gotcha_wrappee_handle_t orig_main_handle;
+
+static int appDaemon_initialized = 0;
+
+int likwid_appDaemon_main(int argc, char** argv)
+{
+    int return_code = 0;
+    typeof(&likwid_appDaemon_main) orig_main = (int (*)(int, char**))gotcha_get_wrappee(orig_main_handle);
+    char* nvEventStr = getenv("NVMON_EVENTS");
+    char* nvGpuStr = getenv("NVMON_GPUS");
+
+    if (appDaemon_initialized)
+    {
+        return_code = orig_main(argc, argv);
+    }
+    else
+    {
+
+        appDaemon_initialized = 1;
+
+
+        return_code = orig_main(argc, argv);
+    }
+
+
+
+
+
+
+    appDaemon_initialized = 0;
+    return return_code;
+}
+
+
+struct gotcha_binding_t likwid_appDaemon_overwrites[] = {
+  {"main", likwid_appDaemon_main, (void*)&orig_main_handle},
+};
+
+
+void __attribute__((constructor)) likwid_appDaemon_constructor()
+{
+    gotcha_wrap(likwid_appDaemon_overwrites, 1 ,"likwid_appDaemon");
+}
diff --git a/src/access-daemon/setFreq.h b/src/access-daemon/setFreq.h
deleted file mode 100644
index 2cc309e5a..000000000
--- a/src/access-daemon/setFreq.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  setFreq.h
- *
- *      Description:  Header for frequency daemon
- *
- *      Version:   4.3.2
- *      Released:  12.04.2018
- *
- *      Authors:  Thomas Roehl (tr), thomas.roehl@googlemail.com
- *
- *      Project:  likwid
- *
- *      Copyright (C) 2018 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef SETFREQ_DAEMON
-#define SETFREQ_DAEMON
-
-#if defined(__i386__) && defined(__PIC__)
-# define EBX_REG "=r"
-#else
-# define EBX_REG "=b"
-#endif
-
-#ifndef __clang__
-#define CPUID(eax,ebx,ecx,edx)                            \
-    __asm__ volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif  \n\t" \
-                     "cpuid                                     \n\t" \
-                     ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif  \n\t" \
-                     : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) \
-                     : "a" (eax), "c" (ecx) \
-                     )
-#else
-#define CPUID(eax,ebx,ecx,edx)         \
-    __asm__ volatile("cpuid" : "=a" (eax), "=c" (ecx), "=d" (edx), EBX_REG (ebx) : "a" (eax), "c" (ecx) );
-#endif
-
-extern int do_pstate (int argn, char** argv);
-extern int do_cpufreq (int argn, char** argv);
-
-#endif
diff --git a/src/access-daemon/setFreqDaemon.c b/src/access-daemon/setFreqDaemon.c
new file mode 100644
index 000000000..a87a512be
--- /dev/null
+++ b/src/access-daemon/setFreqDaemon.c
@@ -0,0 +1,816 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  setFreqDaemon.c
+ *
+ *      Description:  Implementation of frequency daemon.
+ *
+ *      Version:   4.3.1
+ *      Released:  04.01.2018
+ *
+ *      Authors:  Michael Meier, michael.meier@rrze.fau.de
+ *                Jan Treibig (jt), jan.treibig@gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2018 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <syslog.h>
+#include <signal.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/fsuid.h>
+#include <getopt.h>
+#include <dirent.h>
+#include <sys/mman.h>
+
+
+#include <lock.h>
+//#include <error.h>
+#include <frequency_client.h>
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define SA struct sockaddr
+#define str(x) #x
+
+#define CHECK_FILE_ERROR(func, msg)  \
+    if ((func) == 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
+
+#define LOG_AND_EXIT_IF_ERROR(func, msg)  \
+    if ((func) < 0) {  \
+        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
+        exit(EXIT_FAILURE); \
+    }
+
+#define CHECK_ERROR(func, msg)  \
+    if ((func) < 0) { \
+        fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno));  \
+    }
+
+#define PCI_ROOT_PATH    "/proc/bus/pci/"
+
+//#define MAX_NUM_NODES    4
+
+
+
+/* Lock file controlled from outside which prevents likwid to start.
+ * Can be used to synchronize access to the hardware counters
+ * with an external monitoring system. */
+
+/* #####   TYPE DEFINITIONS   ########### */
+
+
+struct cpufreq_files {
+    int  cur_freq;
+    int  max_freq;
+    int  min_freq;
+    int  avail_freq;
+    int  avail_govs;
+    int  driver;
+    int  set_freq;
+    int  set_gov;
+};
+
+char* cpufreq_files[] ={
+    "scaling_cur_freq",
+    "scaling_max_freq",
+    "scaling_min_freq",
+    "scaling_available_frequencies",
+    "scaling_available_governors",
+    "scaling_driver",
+    "scaling_setspeed",
+    "scaling_governor",
+    NULL,
+};
+
+char* pstate_files[] ={
+    "scaling_cur_freq",
+    "scaling_max_freq",
+    "scaling_min_freq",
+    "scaling_available_frequencies",
+    "scaling_available_governors",
+    "scaling_driver",
+    "scaling_setspeed",
+    "scaling_governor",
+    NULL,
+};
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static int sockfd = -1;
+static int connfd = -1; /* temporary in to make it compile */
+static char* filepath;
+static const char* ident = "setFreqD";
+static int avail_cpus = 0;
+static struct cpufreq_files* cpufiles = NULL;
+static char** avail_freqs = NULL;
+static int avail_freqs_count = 0;
+static int no_avail_freqs = 0;
+static char* avail_govs = NULL;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+/* Count the 'processor' lines of /proc/cpuinfo; returns the count, or -errno if popen/pclose fails. */
+static int get_avail_cpus(void)
+{
+    FILE *fpipe = NULL;
+    char cmd_cpu[] = "cat /proc/cpuinfo  | grep 'processor' | sort -u | wc -l";
+    char buff[256] = ""; /* pre-zeroed so atoi() never sees uninitialized bytes */
+
+    if ( !(fpipe = (FILE*)popen(cmd_cpu,"r")) )
+    {  // If fpipe is NULL
+        return -errno;
+    }
+    if (fgets(buff, sizeof(buff), fpipe) == NULL) buff[0] = '\0'; /* nothing read -> atoi() gives 0 */
+    if (pclose(fpipe))
+        return -errno;
+    return atoi(buff);
+}
+
+/* Return 1 if governor name `data` occurs in cpu0's list of available
+ * governors (read from sysfs once and cached in avail_govs), else 0.
+ * `len` is currently unused. */
+static int is_gov_valid(int len, char* data)
+{
+    if (avail_govs == NULL)
+    {
+        char buff[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+        char *filename = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors";
+        if (!access(filename, R_OK))
+        {
+            int fd = open(filename, O_RDONLY);
+            if (fd >= 0) /* 0 is a valid descriptor; only a negative value means failure */
+            {
+                int ret = read(fd, buff, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1);
+                if (ret > 0)
+                {
+                    buff[ret] = '\0';
+                    avail_govs = malloc((strlen(buff)+2)*sizeof(char));
+                    if (avail_govs)
+                    {
+                        /* snprintf() NUL-terminates within the given size */
+                        snprintf(avail_govs, strlen(buff)+1, "%s", buff);
+                    }
+                }
+                close(fd);
+            }
+        }
+    }
+    /* The list can still be missing (file unreadable, read or malloc failed): reject. */
+    return (avail_govs != NULL) && (strstr(avail_govs, data) != NULL);
+}
+
+
+/* Return 1 if `data` starts with one of the frequencies listed in cpu0's
+ * scaling_available_frequencies (parsed once into avail_freqs), else 0.
+ * Returns 1 unconditionally when that sysfs file is not readable. */
+static int is_freq_valid(int len, char* data)
+{
+    int i = 0;
+    if (avail_freqs == NULL)
+    {
+        char buff[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+        char *filename = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies";
+        if (access(filename, R_OK))
+        {
+            no_avail_freqs = 1;
+            return 1;
+        }
+        int fd = open(filename, O_RDONLY);
+        if (fd >= 0) /* 0 is a valid descriptor; only a negative value means failure */
+        {
+            int ret = read(fd, buff, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1);
+            close(fd); /* close on every path, not only after a successful read */
+            if (ret > 0)
+            {
+                int count = 0;
+                buff[ret] = '\0';
+                /* count the separators on the first line to size the pointer array */
+                for (i = 0; buff[i] != '\0' && buff[i] != '\n'; i++)
+                {
+                    if (buff[i] == ' ') count++;
+                }
+                avail_freqs = malloc((count+2) * sizeof(char*));
+                if (avail_freqs)
+                {
+                    char* token = strtok(buff, " ");
+                    count = 0;
+                    while (token != NULL) {
+                        avail_freqs[count] = malloc((strlen(token)+2) * sizeof(char));
+                        if (avail_freqs[count] == NULL)
+                        {
+                            break; /* out of memory: keep the entries parsed so far */
+                        }
+                        ret = snprintf(avail_freqs[count], strlen(token)+1, "%s", token);
+                        if (ret > 0)
+                        {
+                            count++;
+                        }
+                        /* always advance; skipping this would loop forever on the same token */
+                        token = strtok(NULL, " ");
+                        if (token && token[0] == '\n') { break; }
+                    }
+                    avail_freqs_count = count;
+                }
+            }
+        }
+    }
+    for (i = 0; i < avail_freqs_count; i++)
+    {
+        if (strncmp(avail_freqs[i], data, strlen(avail_freqs[i])) == 0)
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+/* Close the sysfs descriptors held in one cpufreq_files entry and mark them as
+ * closed (-1). Only max_freq, min_freq, set_freq and set_gov are handled; the
+ * other members are left untouched (their handling is commented out below). */
+static void close_cpu(struct cpufreq_files* cpufiles)
+{
+    if (cpufiles)
+    {
+/*        if (cpufiles->cur_freq >= 0)*/
+/*        {*/
+/*            syslog(LOG_INFO, "Close cur_freq %d\n", cpufiles->cur_freq);*/
+/*            close(cpufiles->cur_freq);*/
+/*            cpufiles->cur_freq = -1;*/
+/*        }*/
+        if (cpufiles->max_freq >= 0)
+        {
+            close(cpufiles->max_freq);
+            cpufiles->max_freq = -1;
+        }
+        if (cpufiles->min_freq >= 0)
+        {
+            close(cpufiles->min_freq);
+            cpufiles->min_freq = -1;
+        }
+        if (cpufiles->set_freq >= 0)
+        {
+            close(cpufiles->set_freq);
+            cpufiles->set_freq = -1;
+        }
+        if (cpufiles->set_gov >= 0)
+        {
+            close(cpufiles->set_gov);
+            cpufiles->set_gov = -1;
+        }
+/*        if (cpufiles->avail_freq >= 0)*/
+/*        {*/
+/*            syslog(LOG_INFO, "Close avail_freq %d\n", cpufiles->cur_freq);*/
+/*            close(cpufiles->avail_freq);*/
+/*            cpufiles->avail_freq = -1;*/
+/*        }*/
+/*        if (cpufiles->avail_govs >= 0)*/
+/*        {*/
+/*            syslog(LOG_INFO, "Close avail_govs %d\n", cpufiles->cur_freq);*/
+/*            close(cpufiles->avail_govs);*/
+/*            cpufiles->avail_govs = -1;*/
+/*        }*/
+/*        if (cpufiles->driver >= 0)*/
+/*        {*/
+/*            syslog(LOG_INFO, "Close driver %d\n", cpufiles->cur_freq);*/
+/*            close(cpufiles->driver);*/
+/*            cpufiles->driver = -1;*/
+/*        }*/
+        /* Do not free(cpufiles) here: stop_daemon() passes &cpufiles[i], i.e.
+         * elements of the array that stop_daemon() frees itself afterwards. */
+    }
+}
+
+static int open_cpu_file(char* filename, int* fd)
+{
+    int f = -1;
+    int access_flag = R_OK|W_OK;
+    int open_flag = O_RDWR;
+
+    f = open(filename, open_flag);
+    if (f < 0)
+    {
+        syslog(LOG_ERR, "Failed to open file %s \n", filename);
+        *fd = -1;
+        return 0;
+    }
+    *fd = f;
+#ifdef DEBUG_LIKWID
+    syslog(LOG_INFO, "Opened %s %s = %d\n", filename, (open_flag == O_RDONLY ? "readable" : "writable"), *fd);
+#endif
+    return 0;
+}
+
+static int open_cpu(int cpu, struct cpufreq_files* files)
+{
+    char dname[1025];
+    char fname[1025];
+    //struct cpufreq_files* files;
+    FILE* fp = NULL;
+    if (cpu >= 0)
+    {
+        int ret = snprintf(dname, 1024, "/sys/devices/system/cpu/cpu%d/cpufreq", cpu);
+        if (ret > 0)
+        {
+            dname[ret] = '\0';
+        }
+        else
+        {
+            return -1;
+        }
+
+
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_cur_freq");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->cur_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_max_freq");
+        if (ret > 0)
+        {
+            fname[ret] = '\0';
+            if (open_cpu_file(fname, &files->max_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_min_freq");
+        if (ret > 0)
+        {
+            fname[ret] = '\0';
+            if (open_cpu_file(fname, &files->min_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_setspeed");
+        if (ret > 0)
+        {
+            fname[ret] = '\0';
+            if (open_cpu_file(fname, &files->set_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_governor");
+        if (ret > 0)
+        {
+            fname[ret] = '\0';
+            if (open_cpu_file(fname, &files->set_gov) < 0)
+            {
+                goto cleanup;
+            }
+        }
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_available_governors");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->avail_govs) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_available_frequencies");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->avail_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_driver");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->driver) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+        return 0;
+    }
+cleanup:
+    //syslog(LOG_ERR, "Cleanup\n");
+    close_cpu(files);
+    return -1;
+}
+
+
+static void
+kill_client(void)
+{
+    if (connfd != -1)
+    {
+        CHECK_ERROR(close(connfd), socket close failed);
+    }
+
+    connfd = -1;
+}
+
+/* Release all resources held by the daemon (sockets, socket path, frequency
+ * and governor lists, per-CPU files, syslog) and exit with EXIT_SUCCESS. */
+static void stop_daemon(void)
+{
+    kill_client();
+    if (sockfd != -1)
+    {
+        CHECK_ERROR(close(sockfd), socket close sockfd failed);
+    }
+    free(filepath);
+
+    /* free(NULL) is a no-op, so the pointers need no individual checks. */
+    if (avail_freqs != NULL)
+    {
+        for (int idx = 0; idx < avail_freqs_count; idx++)
+            free(avail_freqs[idx]);
+        free(avail_freqs);
+        avail_freqs = NULL;
+        avail_freqs_count = 0;
+    }
+    free(avail_govs);
+    avail_govs = NULL;
+
+    /* Run close_cpu() on every CPU entry before dropping the table. */
+    if (cpufiles != NULL)
+    {
+        int cpu = 0;
+        while (cpu < avail_cpus)
+        {
+            close_cpu(&cpufiles[cpu]);
+            cpu++;
+        }
+        free(cpufiles);
+        cpufiles = NULL;
+    }
+
+    closelog();
+    exit(EXIT_SUCCESS);
+}
+
+
+
+/* Handler for SIGALRM, SIGPIPE and SIGTERM: SIGPIPE is logged and, like
+ * SIGTERM, stops the daemon. Any other signal (SIGALRM) simply returns;
+ * its only purpose is to make the interrupted call fail with EINTR. */
+static void Signal_Handler(int sig)
+{
+    switch (sig)
+    {
+        case SIGPIPE:
+            syslog(LOG_NOTICE, "SIGPIPE? client crashed?!");
+            /* fall through */
+        case SIGTERM:
+            stop_daemon();
+            break;
+    }
+}
+
+/*
+ * Detach the process from its invoker so that it runs as a daemon.
+ * parentPid receives the PID of the calling process, taken before any
+ * fork() happens. A process whose parent is already init (PID 1) is left
+ * untouched. Otherwise the original process exits after a successful
+ * fork() and the child gets a new session, "/" as working directory and
+ * /dev/null as stdin, stdout and stderr. Failures of fork(), setsid() or
+ * chdir() are logged and terminate the process with EXIT_FAILURE.
+ */
+static void
+daemonize(int* parentPid)
+{
+    pid_t child;
+
+    *parentPid = getpid();
+    if (getppid() == 1)
+        return;
+
+    child = fork();
+    if (child < 0)
+    {
+        syslog(LOG_ERR, "fork failed: %s", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    else if (child > 0)
+    {
+        /* The original process is done; only the child carries on. */
+        exit(EXIT_SUCCESS);
+    }
+
+    /* Child: become the leader of a new session. */
+    if (setsid() < 0)
+    {
+        syslog(LOG_ERR, "setsid failed: %s", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    /* Leave the start directory so that it is not kept busy. */
+    if (chdir("/") < 0)
+    {
+        syslog(LOG_ERR, "chdir failed:  %s", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    /* Point the standard streams at /dev/null. */
+    {
+        CHECK_FILE_ERROR(freopen( "/dev/null", "r", stdin), freopen stdin failed);
+        CHECK_FILE_ERROR(freopen( "/dev/null", "w", stdout), freopen stdout failed);
+        CHECK_FILE_ERROR(freopen( "/dev/null", "w", stderr), freopen stderr failed);
+    }
+}
+
+/*
+ * Serve a FREQ_READ request: read the sysfs file selected by rec->loc for
+ * CPU rec->cpu into rec->data (NUL-terminated) and set rec->errorcode.
+ * Returns -1 if the CPU or location is invalid or the file is not open,
+ * otherwise 0; a failing read() is only reported through rec->errorcode.
+ */
+static int freq_read(FreqDataRecord *rec)
+{
+    int read_fd = -1;
+    int cpu = rec->cpu;
+
+    /* The record comes from the client: never index cpufiles out of range. */
+    if (cpu < 0 || cpu >= avail_cpus)
+    {
+        rec->errorcode = FREQ_ERR_NOFILE;
+        return -1;
+    }
+    struct cpufreq_files* f = &cpufiles[cpu];
+    switch(rec->loc)
+    {
+        case FREQ_LOC_CUR: read_fd = f->cur_freq; break;
+        case FREQ_LOC_MIN: read_fd = f->min_freq; break;
+        case FREQ_LOC_MAX: read_fd = f->max_freq; break;
+        case FREQ_LOC_GOV: read_fd = f->set_gov; break;
+    }
+    if (read_fd < 0)
+    {
+        rec->errorcode = FREQ_ERR_NOFILE;
+        return -1;
+    }
+    /* The descriptor stays open between requests, so rewind it first; one
+     * byte of rec->data (assumed to hold LIKWID_FREQUENCY_MAX_DATA_LENGTH
+     * bytes) is reserved for the terminator written below. */
+    (void) lseek(read_fd, 0, SEEK_SET);
+    int ret = read(read_fd, rec->data, LIKWID_FREQUENCY_MAX_DATA_LENGTH - 1);
+    rec->errorcode = (ret < 0) ? FREQ_ERR_NOPERM : FREQ_ERR_NONE;
+    rec->data[(ret < 0) ? 0 : ret] = '\0';
+    return 0;
+}
+
+/* Serve a FREQ_WRITE request: validate rec->data and write it to the sysfs
+ * file selected by rec->loc for CPU rec->cpu. Returns 0 on success and -1
+ * on any error, with the reason stored in rec->errorcode. */
+static int freq_write(FreqDataRecord *rec)
+{
+    int write_fd = -1;
+    int cpu = rec->cpu;
+    /* The record comes from the client: never index cpufiles out of range. */
+    if (cpu < 0 || cpu >= avail_cpus)
+    {
+        rec->errorcode = FREQ_ERR_NOFILE;
+        return -1;
+    }
+    struct cpufreq_files* f = &cpufiles[cpu];
+    switch(rec->loc)
+    {
+        case FREQ_LOC_CUR:
+            /* NOTE(review): set_freq (scaling_setspeed) looks unused - intended here? */
+            write_fd = f->cur_freq;
+#ifdef DEBUG_LIKWID
+            syslog(LOG_INFO, "CMD WRITE CPU %d FREQ_LOC_CUR %d", cpu, write_fd);
+#endif
+            break;
+        case FREQ_LOC_MIN:
+            write_fd = f->min_freq;
+#ifdef DEBUG_LIKWID
+            syslog(LOG_INFO, "CMD WRITE CPU %d FREQ_LOC_MIN %d", cpu, write_fd);
+#endif
+            break;
+        case FREQ_LOC_MAX:
+            write_fd = f->max_freq;
+#ifdef DEBUG_LIKWID
+            syslog(LOG_INFO, "CMD WRITE CPU %d FREQ_LOC_MAX %d", cpu, write_fd);
+#endif
+            break;
+        case FREQ_LOC_GOV:
+            write_fd = f->set_gov;
+#ifdef DEBUG_LIKWID
+            syslog(LOG_INFO, "CMD WRITE CPU %d FREQ_LOC_GOV %d", cpu, write_fd);
+#endif
+            break;
+        default:
+            syslog(LOG_ERR, "Invalid location specified in record\n");
+            break;
+    }
+    if (write_fd < 0)
+    {
+        syslog(LOG_ERR, "No open file for CPU %d at the requested location\n", cpu);
+        rec->errorcode = FREQ_ERR_NOFILE;
+        return -1;
+    }
+    /* write_fd is valid, so rec->loc is a known location. The client-set length
+     * must fit rec->data (assumed LIKWID_FREQUENCY_MAX_DATA_LENGTH bytes). */
+    if ((size_t) rec->datalen > LIKWID_FREQUENCY_MAX_DATA_LENGTH ||
+        !((rec->loc == FREQ_LOC_GOV) ? is_gov_valid(rec->datalen, rec->data)
+                                     : is_freq_valid(rec->datalen, rec->data)))
+    {
+        rec->errorcode = FREQ_ERR_NOPERM;
+        return -1;
+    }
+    if (write(write_fd, rec->data, rec->datalen) < 0)
+    {
+        syslog(LOG_ERR,"No permission: %s\n", strerror(errno));
+        rec->errorcode = FREQ_ERR_NOPERM;
+        return -1;
+    }
+    rec->errorcode = FREQ_ERR_NONE;
+    return 0;
+}
+
+
+/* #####  MAIN FUNCTION DEFINITION   ################## */
+
+/*
+ * Daemon entry point: create a private UNIX socket named after the invoking
+ * PID, wait up to 15 seconds for one client, then serve its FREQ_READ and
+ * FREQ_WRITE requests until FREQ_EXIT, a socket error or a fatal signal.
+ */
+int main(void)
+{
+    int ret;
+    pid_t pid;
+    FreqDataRecord dRecord;
+    struct sockaddr_un  addr1;
+    socklen_t socklen;
+    mode_t oldumask;
+    uint32_t numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+
+    avail_cpus = get_avail_cpus();
+    if (avail_cpus < 0)
+    {
+        avail_cpus = numHWThreads;
+    }
+
+    openlog(ident, 0, LOG_USER);
+
+    if (!lock_check())
+    {
+        syslog(LOG_ERR,"Access to performance counters is locked.\n");
+        stop_daemon();
+    }
+
+    daemonize(&pid);
+#ifdef DEBUG_LIKWID
+    syslog(LOG_INFO, "FrequencyDaemon runs with UID %d, eUID %d\n", getuid(), geteuid());
+#endif
+
+    /* setup filename for socket */
+    filepath = (char*) calloc(sizeof(addr1.sun_path), 1);
+    if (filepath == NULL)
+    {
+        syslog(LOG_ERR,"Failed to allocate space\n");
+        exit(EXIT_FAILURE);
+    }
+    snprintf(filepath, sizeof(addr1.sun_path), TOSTRING(LIKWIDSOCKETBASE) "-freq-%d", pid);
+
+    /* get a socket */
+    LOG_AND_EXIT_IF_ERROR(sockfd = socket(AF_LOCAL, SOCK_STREAM, 0), socket failed);
+
+    /* initialize socket data structure */
+    bzero(&addr1, sizeof(addr1));
+    addr1.sun_family = AF_LOCAL;
+    strncpy(addr1.sun_path, filepath, (sizeof(addr1.sun_path) - 1)); /* null terminated by the bzero() above! */
+
+    /* Change the file mode mask so only the calling user has access
+     * and switch the user/gid with which the following socket creation runs. */
+    oldumask = umask(077);
+    CHECK_ERROR(setfsuid(getuid()), setfsuid failed);
+
+    /* bind and listen on socket */
+    LOG_AND_EXIT_IF_ERROR(bind(sockfd, (SA*) &addr1, sizeof(addr1)), bind failed);
+    LOG_AND_EXIT_IF_ERROR(listen(sockfd, 1), listen failed);
+    LOG_AND_EXIT_IF_ERROR(chmod(filepath, S_IRUSR|S_IWUSR), chmod failed);
+    socklen = sizeof(addr1);
+
+    { /* Init signal handler */
+        struct sigaction sia;
+        sia.sa_handler = Signal_Handler;
+        sigemptyset(&sia.sa_mask);
+        sia.sa_flags = 0;
+        sigaction(SIGALRM, &sia, NULL);
+        sigaction(SIGPIPE, &sia, NULL);
+        sigaction(SIGTERM, &sia, NULL);
+    }
+    /* setup an alarm to stop the daemon if there is no connect.*/
+    alarm(15U);
+    if ((connfd = accept(sockfd, (SA*) &addr1, &socklen)) < 0)
+    {
+        if (errno == EINTR)
+        {
+            syslog(LOG_ERR, "exiting due to timeout - no client connected after 15 seconds.");
+        }
+        else
+        {
+            syslog(LOG_ERR, "accept() failed:  %s", strerror(errno));
+        }
+        CHECK_ERROR(unlink(filepath), unlink of socket failed);
+        exit(EXIT_FAILURE);
+    }
+    alarm(0);
+    CHECK_ERROR(unlink(filepath), unlink of socket failed);
+
+    /* Restore the old umask and fs ids. */
+    (void) umask(oldumask);
+    CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
+
+    /* Size the table by avail_cpus, the bound used for every index into it, and
+     * preset all descriptors to -1 so no entry is ever left uninitialized. */
+    cpufiles = malloc(avail_cpus * sizeof(struct cpufreq_files));
+    if (!cpufiles)
+    {
+        syslog(LOG_ERR,"Failed to allocate space\n");
+        stop_daemon();
+    }
+    memset(cpufiles, -1, avail_cpus * sizeof(struct cpufreq_files));
+    for (int i=0;i<avail_cpus;i++)
+    {
+        ret = open_cpu(i, &cpufiles[i]);
+        if (ret < 0)
+        {
+            syslog(LOG_ERR,"Failed to open files for CPU %d\n", i);
+            stop_daemon();
+        }
+    }
+
+    while (1)
+    {
+        ret = read(connfd, (void*) &dRecord, sizeof(FreqDataRecord));
+        if (ret < 0)
+        {
+            syslog(LOG_ERR, "ERROR - Read returns %d", ret);
+            stop_daemon();
+        }
+        else if (ret != (int) sizeof(FreqDataRecord))
+        {
+            /* 0 bytes: peer closed the socket; fewer than a record: truncated. */
+            syslog(LOG_ERR, "ERROR - [%s:%d] short read of %d bytes, remote socket closed or record truncated", __FILE__, __LINE__, ret);
+            stop_daemon();
+        }
+        if (dRecord.type == FREQ_READ)
+        {
+            freq_read(&dRecord);
+        }
+        else if (dRecord.type == FREQ_WRITE)
+        {
+            freq_write(&dRecord);
+        }
+        else if (dRecord.type == FREQ_EXIT)
+        {
+            stop_daemon();
+        }
+        else
+        {
+            syslog(LOG_ERR, "unknown daemon command type %d", dRecord.type);
+            dRecord.errorcode = FREQ_ERR_UNKNOWN;
+        }
+
+        LOG_AND_EXIT_IF_ERROR(write(connfd, (void*) &dRecord, sizeof(FreqDataRecord)), write failed);
+    }
+    /* never reached */
+    return EXIT_SUCCESS;
+}
+
diff --git a/src/access-daemon/setFreq_cpufreq.c b/src/access-daemon/setFreq_cpufreq.c
deleted file mode 100644
index a31508131..000000000
--- a/src/access-daemon/setFreq_cpufreq.c
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  setFreq.c
- *
- *      Description:  Implementation of frequency daemon
- *
- *      Version:   <VERSION>
- *      Released:  <DATE>
- *
- *      Authors:  Jan Treibig (jt), jan.treibig@gmail.com,
- *                Thomas Roehl (tr), thomas.roehl@googlemail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#include <setFreq.h>
-
-#define AMD_TURBO_MSR 0xC0010015
-
-static char setfiles[3][100] = {"scaling_min_freq", "scaling_max_freq", "scaling_setspeed"};
-static char getfiles[3][100] = {"cpuinfo_min_freq", "cpuinfo_max_freq", "cpuinfo_cur_freq"};
-
-static char turbo_step[20];
-static char steps[30][20];
-static int num_steps = 0;
-
-static char governors[20][30];
-static int num_govs = 0;
-
-enum command {
-    MINIMUM = 0,
-    MAXIMUM = 1,
-    TURBO = 2,
-    GOVERNER
-};
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static int isTurbo(const int cpu_id)
-{
-    FILE *f = NULL;
-    char buff[256];
-    char *rptr = NULL, *sptr = NULL;
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    rptr = fgets(buff, 256, f);
-    if (strlen(turbo_step) > 0 && strncmp(turbo_step, rptr, strlen(turbo_step)) == 0)
-    {
-        return 1;
-    }
-    return 0;
-}
-
-static int isAMD()
-{
-    unsigned int eax,ebx,ecx,edx;
-    eax = 0x0;
-    CPUID(eax,ebx,ecx,edx);
-    if (ecx == 0x444d4163)
-        return 1;
-    return 0;
-}
-
-static int setAMDTurbo(const int cpu_id, const int turbo)
-{
-    int ret = 0;
-    int fd = 0;
-    unsigned long int data = 0x0ULL;
-    char buff[256];
-    sprintf(buff, "/dev/cpu/%d/msr", cpu_id);
-    fd = open(buff, O_RDWR);
-    ret = pread(fd, &data, sizeof(unsigned long int), AMD_TURBO_MSR);
-
-    if (turbo)
-    {
-        data &= ~(1ULL << 25);
-    }
-    else
-    {
-        data |= (1ULL << 25);
-    }
-    ret = pwrite(fd, &data, sizeof(unsigned long int), AMD_TURBO_MSR);
-    if (ret != sizeof(unsigned long int))
-        return EXIT_FAILURE;
-    return EXIT_SUCCESS;
-}
-
-
-static int getAvailFreq(const int cpu_id )
-{
-    int i, j, k;
-    FILE *f = NULL;
-    char buff[256];
-    char tmp[10];
-    char *rptr = NULL, *sptr = NULL;
-    unsigned int d = 0;
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_available_frequencies", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    rptr = fgets(buff, 256, f);
-    if (rptr != NULL)
-    {
-        sptr = strtok(buff, " ");
-        if (sptr != NULL)
-        {
-            d = strtoul(sptr, NULL, 10);
-            snprintf(turbo_step, 19, "%u", d);
-        }
-        while (sptr != NULL)
-        {
-            if (sptr != NULL)
-            {
-                d = strtoul(sptr, NULL, 10);
-                if (d == 0)
-                    break;
-                if (num_steps < 30)
-                {
-                    snprintf(steps[num_steps], 19, "%u", d);
-                    num_steps++;
-                }
-            }
-            sptr = strtok(NULL, " ");
-        }
-    }
-    fclose(f);
-    return num_steps;
-}
-
-static int getAvailGovs(const int cpu_id )
-{
-    int i, j, k;
-    FILE *f = NULL;
-    char cmd[256];
-    char buff[256];
-    char tmp[10];
-    char* eptr = NULL, *rptr = NULL;
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_available_governors", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    rptr = fgets(buff, 256, f);
-    if (rptr != NULL)
-    {
-        eptr = strtok(buff, " ");
-        snprintf(governors[num_govs], 19, "%s", eptr);
-        num_govs++;
-        while (eptr != NULL)
-        {
-            eptr = strtok(NULL, " ");
-            if (eptr != NULL && num_govs < 20 && strlen(eptr) > 1)
-            {
-                snprintf(governors[num_govs], 19, "%s", eptr);
-                num_govs++;
-            }
-        }
-    }
-/*    if (num_govs > 0 && strlen(turbo_step) > 0 && num_govs < 20)*/
-/*    {*/
-/*        snprintf(governors[num_govs], 19, "turbo");*/
-/*        num_govs++;*/
-/*    }*/
-    fclose(f);
-    return num_govs;
-}
-
-static void
-help(char *execname)
-{
-    int nsteps = num_steps, ngovs = num_govs;
-    int stepstart = 0;
-    fprintf(stderr, "Usage: %s <processorID> <cmd> <frequency|governor> \n",execname);
-    fprintf(stderr, "       Valid values for <cmd>:\n");
-    fprintf(stderr, "       - min: change minimum ratio limit of frequency\n");
-    fprintf(stderr, "       - max: change maximum ratio limit of frequency\n");
-    fprintf(stderr, "       - tur: Turn turbo \"on\" or \"off\"\n");
-    fprintf(stderr, "       - gov: change governor\n");
-    printf("Frequency steps: (Freqs. in kHz)\n");
-    if (num_steps == 0)
-        nsteps = getAvailFreq(0);
-
-    if ((!isTurbo(0)) && (!isAMD()))
-        stepstart = 1;
-    for (int s=nsteps-1; s>=stepstart; s--)
-        printf("%s ", steps[s]);
-    printf("\n");
-    printf("Governors:\n");
-    if (num_govs == 0)
-        ngovs = getAvailGovs(0);
-    for (int s=0; s<ngovs; s++)
-        printf("%s ", governors[s]);
-    printf("\n");
-    //printf("%s\n", eptr);
-}
-
-static int
-get_numCPUs()
-{
-    int cpucount = 0;
-    char line[1024];
-    FILE* fp = fopen("/proc/cpuinfo","r");
-    if (fp != NULL)
-    {
-        while( fgets(line, 1024, fp) )
-        {
-            if (strncmp(line, "processor", 9) == 0)
-            {
-                cpucount++;
-            }
-        }
-        fclose(fp);
-    }
-    return cpucount;
-}
-
-static unsigned int
-read_freq(char* fstr)
-{
-    unsigned int freq = strtoul(fstr, NULL, 10);
-    printf("%u\n", freq);
-    if (freq == 0)
-    {
-        fprintf(stderr, "Frequency must be greater than 0.\n");
-        exit(EXIT_FAILURE);
-    }
-    return freq;
-}
-
-static int
-valid_freq(unsigned long freq)
-{
-    FILE *f = NULL;
-    const char fname[] = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies";
-    char delimiter[] = " ";
-    char buff[1024];
-    char freqstr[25];
-    char *ptr = NULL, *eptr = NULL;
-    
-    snprintf(freqstr, 24, "%lu", freq);
-    f = fopen(fname, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Cannot open file %s for reading!\n", fname);
-        return 0;
-    }
-    eptr = fgets(buff, 1024, f);
-    if (eptr == NULL)
-    {
-        fprintf(stderr, "Cannot read content of file %s!\n", fname);
-        fclose(f);
-        return 0;
-    }
-    ptr = strtok(buff, delimiter);
-    while (ptr != NULL)
-    {
-        if (strncmp(ptr, freqstr, strlen(ptr)) == 0)
-        {
-            fclose(f);
-            return 1;
-        }
-        ptr = strtok(NULL, delimiter);
-    }
-    fclose(f);
-    return 0;
-}
-
-static int
-valid_gov(char* gov)
-{
-    FILE *f = NULL;
-    const char fname[] = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors";
-    char delimiter[] = " ";
-    char buff[1024];
-    char *ptr = NULL, *eptr = NULL;
-
-    f = fopen(fname, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Cannot open file %s for reading!\n", fname);
-        return 0;
-    }
-    eptr = fgets(buff, 1024, f);
-    if (eptr == NULL)
-    {
-        fprintf(stderr, "Cannot read content of file %s!\n", fname);
-        fclose(f);
-        return 0;
-    }
-    ptr = strtok(buff, delimiter);
-    while (ptr != NULL)
-    {
-        if (strncmp(ptr, gov, strlen(ptr)) == 0)
-        {
-            fclose(f);
-            return 1;
-        }
-        ptr = strtok(NULL, delimiter);
-    }
-    fclose(f);
-    return 0;
-}
-
-/* #####  MAIN FUNCTION DEFINITION   ################## */
-
-int
-do_cpufreq (int argn, char** argv)
-{
-    int i = 0;
-    int cpuid = 0;
-    int set_id = -1;
-    unsigned int freq = 0;
-    int turbo = -1;
-    int numCPUs = 0;
-    enum command cmd;
-    char* gov = NULL;
-    char* fpath = NULL;
-    FILE* f = NULL;
-    int num_steps = 0, num_govs = 0;
-
-    if (argn < 3 || argn > 4)
-    {
-        help(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-
-    /* Check for valid CPU */
-    cpuid = atoi(argv[1]);
-    num_steps = getAvailFreq(cpuid);
-    num_govs = getAvailGovs(cpuid);
-    numCPUs = get_numCPUs();
-    if (cpuid < 0 || cpuid > numCPUs)
-    {
-        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n", cpuid, numCPUs);
-        exit(EXIT_FAILURE);
-    }
-
-    /* Read in command and argument */
-    if (strncmp(argv[2], "tur", 3) == 0)
-    {
-        cmd = TURBO;
-        if (strncmp(argv[3], "0", 1) != 0 && strncmp(argv[3], "1", 1) != 0)
-        {
-            fprintf(stderr, "Invalid turbo setting %s! Only 0 (off) and 1 (on) allowed\n\n",argv[3]);
-            help(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-        turbo = atoi(argv[3]);
-        if (turbo < 0 || turbo > 1)
-        {
-            fprintf(stderr, "Invalid turbo setting %d! Only 0 (off) and 1(on) allowed\n\n",turbo);
-            help(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-    }
-    else if (strncmp(argv[2], "min", 3) == 0)
-    {
-        cmd = MINIMUM;
-        freq = read_freq(argv[3]);
-        if (!valid_freq(freq))
-        {
-            fprintf(stderr, "Invalid frequency %lu!\n",freq);
-            if (freq == read_freq(turbo_step))
-            {
-                fprintf(stderr, "In order to set the turbo frequency, use tur(bo) option\n");
-            }
-            fprintf(stderr, "\n\n");
-            help(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-    }
-    else if (strncmp(argv[2], "max", 3) == 0)
-    {
-        cmd = MAXIMUM;
-        freq = read_freq(argv[3]);
-        if (!valid_freq(freq))
-        {
-            fprintf(stderr, "Invalid frequency %lu!\n",freq);
-            if (freq == read_freq(turbo_step))
-            {
-                fprintf(stderr, "In order to set the turbo frequency, use tur(bo) option\n");
-            }
-            fprintf(stderr, "\n\n");
-            help(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-    }
-    else if (strncmp(argv[2], "gov", 3) == 0)
-    {
-        cmd = GOVERNER;
-        gov = argv[3];
-        /* Only allow specific governors */
-        if (!valid_gov(gov))
-        {
-            fprintf(stderr, "Invalid governor %s!\n\n",gov);
-            help(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-    }
-    else
-    {
-        fprintf(stderr, "Unknown command %s!\n\n", argv[2]);
-        help(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-
-    fpath = malloc(100 * sizeof(char));
-    if (!fpath)
-    {
-        fprintf(stderr, "Unable to allocate space!\n\n");
-        exit(EXIT_FAILURE);
-    }
-
-    /* If the current frequency should be set we have to make sure that the governor is
-     * 'userspace'. Minimal and maximal frequency are possible for other governors but
-     * they dynamically adjust the current clock speed.
-     */
-    if (cmd == MINIMUM || cmd == MAXIMUM)
-    {
-        int tmp = 0;
-        char testgov[1024];
-        snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
-        f = fopen(fpath, "r");
-        if (f == NULL) {
-            fprintf(stderr, "Unable to open path %s for reading\n",fpath);
-            free(fpath);
-            return (EXIT_FAILURE);
-        }
-        tmp = fread(testgov, 100, sizeof(char), f);
-/*        if (strncmp(testgov, "userspace", 9) != 0)*/
-/*        {*/
-/*            fclose(f);*/
-/*            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);*/
-/*            f = fopen(fpath, "w");*/
-/*            if (f == NULL) {*/
-/*                fprintf(stderr, "Unable to open path %s for writing\n", fpath);*/
-/*                free(fpath);*/
-/*                return (EXIT_FAILURE);*/
-/*            }*/
-/*            fprintf(f,"userspace");*/
-/*        }*/
-        fclose(f);
-    }
-
-    switch(cmd)
-    {
-        //case SET_CURRENT:
-        case MINIMUM:
-        case MAXIMUM:
-            /* The cmd is also used as index in the setfiles array */
-            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[cmd]);
-            f = fopen(fpath, "w");
-            if (f == NULL) {
-                fprintf(stderr, "Unable to open path %s for writing\n",fpath);
-                free(fpath);
-                return (EXIT_FAILURE);
-            }
-            fprintf(f,"%u",freq);
-            fclose(f);
-            break;
-        case TURBO:
-            if (!isAMD())
-            {
-                if (turbo == 0)
-                {
-                    double fr = 0;
-                    snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[MAXIMUM]);
-                    f = fopen(fpath, "w");
-                    if (f == NULL) {
-                        fprintf(stderr, "Unable to open path %s for writing\n",fpath);
-                        free(fpath);
-                        return (EXIT_FAILURE);
-                    }
-                    fprintf(f,"%s",steps[1]);
-                    fclose(f);
-                }
-                else
-                {
-                    snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/%s", cpuid, setfiles[MAXIMUM]);
-                    f = fopen(fpath, "w");
-                    if (f == NULL) {
-                        fprintf(stderr, "Unable to open path %s for writing\n",fpath);
-                        free(fpath);
-                        return (EXIT_FAILURE);
-                    }
-                    fprintf(f,"%s", turbo_step);
-                    fclose(f);
-                }
-            }
-            else
-            {
-                return setAMDTurbo(cpuid, turbo);
-            }
-            break;
-        case GOVERNER:
-            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
-            f = fopen(fpath, "w");
-            if (f == NULL) {
-                fprintf(stderr, "Unable to open path %s for writing\n", fpath);
-                free(fpath);
-                return (EXIT_FAILURE);
-            }
-            fprintf(f,"%s",gov);
-            fclose(f);
-            break;
-    }
-    
-    free(fpath);
-    return EXIT_SUCCESS;
-
-}
-
diff --git a/src/access-daemon/setFreq_pstate.c b/src/access-daemon/setFreq_pstate.c
deleted file mode 100644
index f9078df27..000000000
--- a/src/access-daemon/setFreq_pstate.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  setFreq.c
- *
- *      Description:  Implementation of frequency daemon
- *
- *      Version:   <VERSION>
- *      Released:  <DATE>
- *
- *      Authors:  Thomas Roehl (tr), thomas.roehl@googlemail.com
- *                Amin Nabikhani, amin.nabikhani@gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <dirent.h>
-#include <errno.h>
-#include <setFreq.h>
-
-static char setfiles[3][100] = {"min_perf_pct", "max_perf_pct","no_turbo"};
-static char getfiles[3][100] = {"cpuinfo_min_freq", "cpuinfo_max_freq", "cpuinfo_cur_freq"};
-static char governers[20][100];
-static unsigned int freqs[100];
-static unsigned int percent[100];
-
-enum command {
-    MINIMUM = 0,
-    MAXIMUM = 1,
-    TURBO = 2,
-    GOVERNOR
-};
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static void help(char *execname)
-{
-    fprintf(stderr, "Usage: %s <processorID> <cmd> <frequency|governor> \n",execname);
-    fprintf(stderr, "       Valid values for <cmd>:\n");
-    fprintf(stderr, "       - min: change minimum ratio limit of frequency\n");
-    fprintf(stderr, "       - max: change maximum ratio limit of frequency\n");
-    fprintf(stderr, "       - tur: Turn turbo \"on\" or \"off\"\n");
-    fprintf(stderr, "       - gov: change governor\n");
-}
-
-static int check_driver()
-{
-    int ret = 1;
-    DIR* dir = opendir("/sys/devices/system/cpu/intel_pstate");
-    if (ENOENT == errno)
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  intel_pstate is not present!\n");
-        ret = 0;
-    }
-
-    closedir(dir);
-    return ret;
-}
-
-static unsigned int getMax()
-{
-    char line[1024];
-    unsigned int maxFreq = 0;
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/cpufreq/policy0/cpuinfo_max_freq", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        maxFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-
-    return maxFreq;
-}
-
-static unsigned int getCurMax()
-{
-    char line[1024];
-    unsigned int maxFreq = 0;
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/max_perf_pct", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        maxFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-
-    return maxFreq;
-}
-
-
-static unsigned int getMin()
-{
-    char line[1024];
-    unsigned int minFreq = 0;
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/cpufreq/policy0/cpuinfo_min_freq", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        minFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-
-    return minFreq;
-}
-
-static unsigned int getCurMin()
-{
-    char line[1024];
-    unsigned int minFreq = 0;
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/min_perf_pct", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        minFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-
-    return minFreq;
-}
-
-static unsigned int turbo_pct()
-{
-    char readval[4];
-    unsigned int turbo_pct;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/turbo_pct","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 4, fp) )
-        {
-            turbo_pct = strtoul(readval,NULL,10);
-        }
-        fclose(fp);
-    }
-    return turbo_pct;
-}
-
-static unsigned int num_pstates()
-{
-    char readval[4];
-    unsigned int num;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/num_pstates","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 4, fp) )
-        {
-            num = strtoul(readval,NULL,10);
-        }
-        fclose(fp);
-    }
-    else
-    {
-        exit(1);
-    }
-    return num;
-}
-
-static int mode()
-{
-    char readval[5];
-    char tmode;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/no_turbo","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 5, fp) )
-        {
-            tmode = atoi(readval);
-        }
-        fclose(fp);
-    }
-    return tmode;
-}
-
-
-static int getGov()
-{
-    FILE *f = NULL;
-    const char fname[] = "/sys/devices/system/cpu/cpufreq/policy0/scaling_available_governors";
-    char delimiter[] = " ";
-    char buff[1024];
-    char *ptr = NULL, *eptr = NULL;
-    unsigned int count = 0;
-
-    f = fopen(fname, "r");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Cannot open file %s for reading!\n", fname);
-        return 0;
-    }
-    eptr = fgets(buff, 1024, f);
-
-    if (eptr == NULL)
-    {
-        fprintf(stderr, "Cannot read content of file %s!\n", fname);
-        fclose(f);
-        return 0;
-    }
-    ptr = strtok(buff, delimiter);
-    while (ptr != NULL)
-    {
-        strcpy(governers[count],ptr);
-        ptr = strtok(NULL, delimiter);
-        ptr = strtok(ptr, "\n");
-        count= count + 1;
-    }
-    fclose(f);
-    return 0;
-}
-
-static void steps()
-{
-    unsigned int minFreq = getMin();
-    unsigned int trb = turbo_pct();
-    unsigned int maxFreq = getMax();
-    unsigned int step = num_pstates();
-    int range = 0;
-
-    if(maxFreq != 0)
-    {
-        int t = mode();
-        if (t != 0)
-        {
-            maxFreq = getMax()/(1+0.01*trb);
-        }
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-    if(step != 0)
-    {
-        range = (maxFreq-minFreq)/step;
-        freqs[0] = minFreq;
-        freqs[step-1]= maxFreq;
-        percent[0] = (minFreq/(float)maxFreq) * 100;
-        percent[step-1] = 100;
-
-        for(size_t i=1; i < step-1; i++)
-        {
-            freqs[i] = minFreq+ i* range;
-            percent[i] = (freqs[i]/(float)maxFreq) * 100;
-        }
-    }
-    else
-    {
-        fprintf(stderr,"\tEXIT WITH ERROR:  # of pstates could not be read");
-    }
-}
-
-static void throw(char* arg)
-{
-    unsigned int step = num_pstates();
-    unsigned int count = 0;
-    help(arg);
-    printf("Frequency steps: (Freqs. in kHz)\n");
-    for(unsigned int i=0; i < step; i++)
-    {
-        //printf("\t%.1f\t%u %s\n",1E-6*((double)freqs[i]),percent[i],"%");
-        unsigned int t = (freqs[i]/10000)*10000;
-        
-        printf("%lu ", t);
-    }
-    printf("\n");
-    printf("Governors:\n");
-    while (strcmp(governers[count],"") != 0)
-    {
-        printf("%s ",governers[count]);
-        count+=1;
-    }
-    printf("\n");
-}
-
-static int valid_gov(char* gov)
-{
-    unsigned int count = 0;
-    while (strcmp(governers[count],"") != 0)
-    {
-        if (strncmp(governers[count], gov, strlen(governers[count])) == 0)
-        {
-            return 1;
-        }
-        count = count + 1;
-    }
-    return 0;
-}
-
-static int valid_freq(char* freq)
-{
-    int idx = -1;
-    int ret = 0;
-    unsigned int step = num_pstates();
-    char fstep[20];
-    unsigned int f = (unsigned int)(atof(freq)*1000000);
-    for (int s=0;s<step;s++)
-    {
-        if ((freqs[s] >= f-10000) && ((freqs[s] <= f+10000)))
-        {
-            idx = s;
-            break;
-        }
-/*        memset(fstep, 0, 20*sizeof(char));*/
-/*        ret = sprintf(fstep, "%.1f", 1E-6*((double)freqs[s]));*/
-/*        fstep[ret] = '\0';*/
-/*        if (strcmp(fstep, freq) == 0)*/
-/*        {*/
-/*            idx = s;*/
-/*            break;*/
-/*        }*/
-    }
-    return idx;
-}
-
-
-int
-do_pstate (int argn, char** argv)
-{
-    check_driver();
-    steps();
-    getGov();
-    unsigned int step = num_pstates();
-    unsigned int minFreq = freqs[0];
-    unsigned int maxFreq = freqs[step-1];
-    int frq_pct = -1;
-    int idx = -1;
-    char* gov = NULL;
-    char* freq = NULL;
-    FILE *f = NULL;
-    enum command key;
-    char* fpath = NULL;
-
-    if (argn != 4)
-    {
-        throw(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-    freq = argv[3];
-
-    if (strncmp(argv[2], "min", 3) == 0)
-    {
-        key = MINIMUM;
-        idx = valid_freq(freq);
-        if (idx < 0)
-        {
-            fprintf(stderr, "Invalid frequency %s!\n\n",freq);
-            throw(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-        frq_pct = percent[idx];
-    }
-    else if (strncmp(argv[2], "max", 3) == 0)
-    {
-        key = MAXIMUM;
-        idx = valid_freq(freq);
-
-        if (idx < 0)
-        {
-            fprintf(stderr, "Invalid frequency %s!\n\n",freq);
-            throw(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-        frq_pct = percent[idx];
-    }
-
-    else if (strncmp(argv[2], "gov", 3) == 0)
-    {
-        key = GOVERNOR;
-        gov = argv[3];
-        if (!valid_gov(gov))
-        {
-            fprintf(stderr, "Invalid governor %s!\n\n",gov);
-            throw(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-    }
-    else if (strncmp(argv[2], "tur", 3) == 0)
-    {
-        key = TURBO;
-        frq_pct = atoi(argv[3]);
-        if (frq_pct != 0 && frq_pct != 1)
-        {
-            fprintf(stderr, "Invalid value for trubo mode: \"%u\"!, the value must be either 0 or 1 \n\n",frq_pct);
-            throw(argv[0]);
-            exit(EXIT_FAILURE);
-        }
-        frq_pct = (frq_pct == 1 ? 0 : 1);
-    }
-    else
-    {
-        fprintf(stderr, "Unknown command %s!\n\n", argv[1]);
-        throw(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-
-    fpath = malloc(100 * sizeof(char));
-    if (!fpath)
-    {
-        fprintf(stderr, "Unable to allocate space!\n\n");
-        exit(EXIT_FAILURE);
-    }
-
-redo:
-    switch(key)
-    {
-        case MINIMUM:
-        case MAXIMUM:
-            snprintf(fpath, 99, "/sys/devices/system/cpu/intel_pstate/%s", setfiles[key]);
-            printf("File %s\n", fpath);
-            f = fopen(fpath, "w+");
-            if (f == NULL) {
-                fprintf(stderr, "Unable to open path \"%s\" for writing\n",fpath);
-                free(fpath);
-                return (EXIT_FAILURE);
-            }
-            printf("Write percentage %lu\n", frq_pct);
-            fprintf(f,"%u",frq_pct);
-            fclose(f);
-            break;
-
-        case GOVERNOR:
-            snprintf(fpath, 99, "/sys/devices/system/cpu/cpu%s/cpufreq/scaling_governor", argv[1]);
-            unsigned int bturbo = mode();
-            f = fopen(fpath, "w");
-            if (f == NULL) {
-                fprintf(stderr, "Unable to open path \"%s\" for writing\n", fpath);
-                free(fpath);
-                return (EXIT_FAILURE);
-            }
-            fprintf(f,"%s",gov);
-            fclose(f);
-            unsigned int aturbo = mode();
-            if (bturbo != aturbo)
-            {
-                frq_pct = bturbo;
-                key = TURBO;
-                goto redo;
-            }
-            break;
-
-        case TURBO:
-            snprintf(fpath, 99, "/sys/devices/system/cpu/intel_pstate/%s", setfiles[key]);
-            f = fopen(fpath, "w+");
-            if (f == NULL) {
-                fprintf(stderr, "Unable to open path \"%s\" for writing\n",fpath);
-                free(fpath);
-                return (EXIT_FAILURE);
-            }
-            fprintf(f,"%u",frq_pct);
-            fclose(f);
-            break;
-    }
-
-    return 0;
-}
-
diff --git a/src/affinity.c b/src/affinity.c
index bd377a5f8..5f4811ee6 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -229,7 +229,7 @@ static int create_lookups()
         }
         affinity_thread2numa_lookup[hwthreadid] = memid;
         DEBUG_PRINT(DEBUGLEV_DEVELOP, affinity_thread2numa_lookup[%d] = %d, hwthreadid, memid);
-        if (do_cache)
+        if (do_cache && cachelimit > 0)
         {
             if (pu_idx % cachelimit == 0)
             {
@@ -243,41 +243,6 @@ static int create_lookups()
     return 0;
 }
 
-/*static int create_locks()*/
-/*{*/
-/*    numa_init();*/
-/*    if (!socket_lock)*/
-/*    {*/
-/*        socket_lock = malloc(cpuid_topology.numSockets * sizeof(int));*/
-/*        memset(socket_lock, LOCK_INIT, cpuid_topology.numSockets*sizeof(int));*/
-/*    }*/
-/*    if (!tile_lock)*/
-/*    {*/
-/*        tile_lock = malloc(cpuid_topology.numHWThreads * sizeof(int));*/
-/*        memset(tile_lock, LOCK_INIT, cpuid_topology.numHWThreads*sizeof(int));*/
-/*    }*/
-/*    if (!numa_lock)*/
-/*    {*/
-/*        numa_lock = malloc(numa_info.numberOfNodes * sizeof(int));*/
-/*        memset(numa_lock, LOCK_INIT, numa_info.numberOfNodes*sizeof(int));*/
-/*    }*/
-/*    if (!core_lock)*/
-/*    {*/
-/*        int cores = (cpuid_topology.numHWThreads/cpuid_topology.numThreadsPerCore);*/
-/*        core_lock = malloc(cores * sizeof(int));*/
-/*        memset(core_lock, LOCK_INIT, cores*sizeof(int));*/
-/*    }*/
-/*    if (!sharedl2_lock)*/
-/*    {*/
-/*        sharedl2_lock = malloc(cpuid_topology.numHWThreads * sizeof(int));*/
-/*        memset(sharedl2_lock, LOCK_INIT, cpuid_topology.numHWThreads*sizeof(int));*/
-/*    }*/
-/*    if (!sharedl3_lock)*/
-/*    {*/
-/*        sharedl3_lock = malloc(cpuid_topology.numHWThreads * sizeof(int));*/
-/*        memset(sharedl3_lock, LOCK_INIT, cpuid_topology.numHWThreads*sizeof(int));*/
-/*    }*/
-/*}*/
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
@@ -294,14 +259,19 @@ affinity_init()
         return;
     }
     topology_init();
+    numa_init();
 
-    create_lookups();
     //create_locks();
     int numberOfSocketDomains = cpuid_topology.numSockets;
 
     DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Socket domains %d, numberOfSocketDomains);
-    numa_init();
+
     int numberOfNumaDomains = numa_info.numberOfNodes;
+    for (int i = numberOfNumaDomains-1; i >= 0; i--) /* discount NUMA nodes that report no processors */
+    {
+        if (numa_info.nodes[i].numberOfProcessors == 0)
+            numberOfNumaDomains--;
+    }
     DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: NUMA domains %d, numberOfNumaDomains);
     int numberOfProcessorsPerSocket =
         cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore;
@@ -521,6 +491,9 @@ affinity_init()
     affinityDomains.numberOfCoresPerCache = numberOfCoresPerCache;
     affinityDomains.numberOfProcessorsPerCache = numberOfProcessorsPerCache;
     affinityDomains.domains = domains;
+
+    create_lookups();
+
     affinity_initialized = 1;
 }
 
@@ -570,18 +543,6 @@ affinity_finalize()
         free(affinity_thread2numa_lookup);
         affinity_thread2numa_lookup = NULL;
     }
-/*    if (socket_lock)*/
-/*        free(socket_lock);*/
-/*    if (core_lock)*/
-/*        free(core_lock);*/
-/*    if (tile_lock)*/
-/*        free(tile_lock);*/
-/*    if (numa_lock)*/
-/*        free(numa_lock);*/
-/*    if (sharedl2_lock)*/
-/*        free(sharedl2_lock);*/
-/*    if (sharedl3_lock)*/
-/*        free(sharedl3_lock);*/
 
     affinityDomains.domains = NULL;
     affinity_numberOfDomains = 0;
diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua
index 48f02df87..903d0a071 100644
--- a/src/applications/likwid-mpirun.lua
+++ b/src/applications/likwid-mpirun.lua
@@ -70,8 +70,10 @@ local function usage()
     print_stdout("-n/-np <count>\t\t Set the number of processes")
     print_stdout("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
     print_stdout("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
+    print_stdout("-t/-tpp <count>\t\t Set the number of threads per MPI process")
+    print_stdout("--dist <d>(:order)\t Specify the CPU distance between MPI processes. Possible orders are close and spread.")
     print_stdout("-s, --skip <hex>\t Bitmask with threads to skip")
-    print_stdout("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
+    print_stdout("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi, mvapich2 or slurm")
     print_stdout("\t\t\t If not set, module system is checked")
     print_stdout("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
     print_stdout("\t\t\t Only required for statically linked executables.")
@@ -79,8 +81,10 @@ local function usage()
     print_stdout("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
     print_stdout("-m/-marker\t\t Activate marker API mode")
     print_stdout("-O\t\t\t Output easily parseable CSV instead of fancy tables")
+    print_stdout("-o/--output <file>\tWrite output to a file. The file is reformatted according to the suffix.")
     print_stdout("-f\t\t\t Force overwrite of registers if they are in use. You can also use environment variable LIKWID_FORCE")
     print_stdout("-e, --env <key>=<value>\t Set environment variables for MPI processes")
+    print_stdout("--mpiopts <str>\t Hand over options to underlying MPI. Please use proper quoting.")
     print_stdout("")
     print_stdout("Processes are pinned to physical CPU cores first. For syntax questions see likwid-pin")
     print_stdout("")
@@ -93,7 +97,10 @@ end
 
 local np = 0
 local ppn = 0
+local dist = 1
 local tpp = 1
+local tpp_orderings = {"close", "spread"}
+local tpp_ordering = "spread"
 local nperdomain = nil
 local npernode = 0
 local cpuexprs = {}
@@ -106,11 +113,12 @@ local omptype = nil
 local skipStr = ""
 local executable = {}
 local envsettings = {}
-local mpiopts = {}
+local mpiopts = nil
 local debug = false
 local likwiddebug = false
 local use_marker = false
 local use_csv = false
+local outfile = nil
 local force = false
 local print_stats = false
 if os.getenv("LIKWID_FORCE") ~= nil then
@@ -241,7 +249,7 @@ local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes)
 
     local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s %s",
                                 mpiexecutable, hostfile, bindstr,
-                                np, ppn, table.concat(mpiopts, ' '), wrapperscript)
+                                np, ppn, mpiopts or "", wrapperscript) -- mpiopts stays nil unless --mpiopts is given
     if debug then
         print_stdout("EXEC: "..cmd)
     end
@@ -348,8 +356,9 @@ local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
             envstr = envstr .. string.format("-env %s %s ", i, e)
         end
     end
-    for i,e in pairs(mpiopts) do
-        envstr = envstr .. string.format("%s ",e)
+
+    if mpiopts and mpiopts:len() > 0 then
+        envstr = envstr .. mpiopts
     end
     if os.getenv("LIKWID_MPI_CONNECT") ~= nil then
         mpi_connect = os.getenv("LIKWID_MPI_CONNECT")
@@ -466,7 +475,7 @@ local function executeMvapich2(wrapperscript, hostfile, env, nrNodes)
 
     local cmd = string.format("%s -f %s -np %d -ppn %d %s %s %s",
                                 mpiexecutable, hostfile,
-                                np, ppn, envstr, table.concat(mpiopts, ' '), wrapperscript)
+                                np, ppn, envstr, mpiopts or "", wrapperscript) -- mpiopts stays nil unless --mpiopts is given
     if debug then
         print_stdout("EXEC: "..cmd)
     end
@@ -521,6 +530,15 @@ end
 
 local function readHostfileSlurm(hostlist)
     nperhost = tonumber(os.getenv("SLURM_TASKS_PER_NODE"):match("(%d+)"))
+    if force then
+        if os.getenv("SLURM_CPUS_ON_NODE") ~= nil then
+            nperhost = tonumber(os.getenv("SLURM_CPUS_ON_NODE"):match("(%d+)")) / nperhost
+        elseif os.getenv("SLURM_CPUS_PER_TASK") ~= nil then
+            nperhost = tonumber(os.getenv("SLURM_CPUS_PER_TASK"):match("(%d+)"))
+        else
+            nperhost = cpuCount() / nperhost
+        end
+    end
     if hostlist and nperhost then
         hostfile = write_hostlist_to_file(hostlist, nperhost)
         hosts = readHostfilePBS(hostfile)
@@ -591,7 +609,7 @@ local function executeSlurm(wrapperscript, hostfile, env, nrNodes)
         wrapperscript = os.getenv("PWD").."/"..wrapperscript
     end
     local exec = string.format("srun -N %d --ntasks-per-node=%d --cpu_bind=none %s %s",
-                                nrNodes, ppn, table.concat(mpiopts, ' '), wrapperscript)
+                                nrNodes, ppn, mpiopts or "", wrapperscript) -- mpiopts stays nil unless --mpiopts is given
     if debug then
         print_stdout("EXEC: "..exec)
     end
@@ -875,6 +893,9 @@ local function assignHosts(hosts, np, ppn, tpp)
                     current = ppn
                 end]]
                 print_stderr(string.format("ERROR: Oversubscription required. Host %s has only %s slots but %d needed per host", host["hostname"], host["slots"], ppn))
+                if mpitype == "slurm" then
+                    print_stderr("In SLURM environments, it might be a problem with --ntasks (the slots) and --cpus-per-task options")
+                end
                 os.exit(1)
             else
                 table.insert(newhosts, {hostname=host["hostname"],
@@ -972,9 +993,15 @@ local function calculateCpuExprs(nperdomain, cpuexprs)
 
     for i, domidx in pairs(domainlist) do
         local sortedlist = {}
-        for off=1,topo["numThreadsPerCore"] do
-            for i=0,affinity["domains"][domidx]["numberOfProcessors"]/topo["numThreadsPerCore"] do
-                table.insert(sortedlist, affinity["domains"][domidx]["processorList"][off + (i*topo["numThreadsPerCore"])])
+        if tpp_ordering == "spread" then
+            for off=1,topo["numThreadsPerCore"] do
+                for i=0,affinity["domains"][domidx]["numberOfProcessors"]/topo["numThreadsPerCore"] do
+                    table.insert(sortedlist, affinity["domains"][domidx]["processorList"][off + (i*topo["numThreadsPerCore"])])
+                end
+            end
+        elseif tpp_ordering == "close" then
+            for i=1,affinity["domains"][domidx]["numberOfProcessors"] do -- Lua sequences start at index 1
+                table.insert(sortedlist, affinity["domains"][domidx]["processorList"][i])
             end
         end
         local tmplist = {}
@@ -985,6 +1012,9 @@ local function calculateCpuExprs(nperdomain, cpuexprs)
                 table.remove(sortedlist, 1)
             end
             table.insert(newexprs, tmplist)
+            if dist > threads then
+                for t=1, dist-threads do table.remove(sortedlist, 1) end
+            end
         end
     end
     if debug then
@@ -1016,24 +1046,47 @@ end
 
 local function splitUncoreEvents(groupdata)
     local core = {}
-    local uncore = {}
+    local socket = {}
+    local numa = {}
+    local llc = {}
+    local cpuinfo = likwid.getCpuInfo()
+
     for i, e in pairs(groupdata["Events"]) do
         if  not e["Counter"]:match("FIXC%d") and
             not e["Counter"]:match("^PMC%d") and
             not e["Counter"]:match("TMP%d") then
             local event = e["Event"]..":"..e["Counter"]
-            table.insert(uncore, event)
+            if cpuinfo["architecture"] == "x86_64" and cpuinfo["isIntel"] == 1 then
+                table.insert(socket, event)
+            elseif cpuinfo["architecture"] == "x86_64" then
+                if e["Counter"]:match("^CPMC%d") then
+                    table.insert(llc, event)
+                elseif e["Counter"]:match("^DFC%d") then
+                    table.insert(numa, event)
+                end
+            elseif cpuinfo["architecture"] == "armv8" then
+                table.insert(socket, event)
+            elseif cpuinfo["architecture"] == "armv7" then
+                table.insert(socket, event)
+            end
         else
             local event = e["Event"]..":"..e["Counter"]
             table.insert(core, event)
         end
     end
-    cevents = table.concat(core, ",")
-    uevents = table.concat(core, ",")
-    if #uncore > 0 then
-        uevents = uevents..","..table.concat(uncore,",")
+    sevents = ""
+    nevents = ""
+    levents = ""
+    if #socket > 0 then
+        sevents = table.concat(socket,",")
+    end
+    if #numa > 0 then
+        nevents = table.concat(numa, ",")
+    end
+    if #llc > 0 then
+        levents = table.concat(llc, ",")
     end
-    return cevents, uevents
+    return table.concat(core, ","), sevents, nevents, levents
 end
 
 local function inList(value, list)
@@ -1062,13 +1115,17 @@ local function uniqueList(list)
 end
 
 local function setPerfStrings(perflist, cpuexprs)
-    local uncore = false
+    local suncore = false
     local perfexprs = {}
     local grouplist = {}
-    local cpuinfo = likwid.getCpuInfo()
+    
     local affinity = likwid.getAffinityInfo()
     local socketList = {}
     local socketListFlags = {}
+    local numaList = {}
+    local numaListFlags = {}
+    local llcList = {}
+    local llcListFlags = {}
     for i, d in pairs(affinity["domains"]) do
         if d["tag"]:match("S%d+") then
             local tmpList = {}
@@ -1078,6 +1135,22 @@ local function setPerfStrings(perflist, cpuexprs)
             table.insert(socketList, tmpList)
             table.insert(socketListFlags, 1)
         end
+        if d["tag"]:match("M%d+") then
+            local tmpList = {}
+            for j,cpu in pairs(d["processorList"]) do
+                table.insert(tmpList, cpu)
+            end
+            table.insert(numaList, tmpList)
+            table.insert(numaListFlags, 1)
+        end
+        if d["tag"]:match("C%d+") then
+            local tmpList = {}
+            for j,cpu in pairs(d["processorList"]) do
+                table.insert(tmpList, cpu)
+            end
+            table.insert(llcList, tmpList)
+            table.insert(llcListFlags, 1)
+        end
     end
 
     for k, perfStr in pairs(perflist) do
@@ -1092,13 +1165,23 @@ local function setPerfStrings(perflist, cpuexprs)
             end
 
             local coreevents = ""
-            local uncoreevents = ""
-            coreevents, uncoreevents = splitUncoreEvents(gdata)
+            local socketevents = ""
+            local numaevents = ""
+            local llcevents = ""
+            coreevents, socketevents, numaevents, llcevents = splitUncoreEvents(gdata)
 
             local tmpSocketFlags = {}
             for _,e in pairs(socketListFlags) do
                 table.insert(tmpSocketFlags, e)
             end
+            local tmpNumaFlags = {}
+            for _,e in pairs(numaListFlags) do
+                table.insert(tmpNumaFlags, e)
+            end
+            local tmpCacheFlags = {}
+            for _,e in pairs(llcListFlags) do
+                table.insert(tmpCacheFlags, e)
+            end
 
             for i,cpuexpr in pairs(cpuexprs) do
                 local slist = {}
@@ -1110,19 +1193,57 @@ local function setPerfStrings(perflist, cpuexprs)
                     end
                 end
                 slist = uniqueList(slist)
-                local uncore = false
+                local mlist = {}
+                for j, cpu in pairs(cpuexpr) do
+                    for l, numalist in pairs(numaList) do
+                        if inList(tonumber(cpu), numalist) then
+                            table.insert(mlist, l)
+                        end
+                    end
+                end
+                mlist = uniqueList(mlist)
+                local clist = {}
+                for j, cpu in pairs(cpuexpr) do
+                    for l, llclist in pairs(llcList) do
+                        if inList(tonumber(cpu), llclist) then
+                            table.insert(clist, l)
+                        end
+                    end
+                end
+                clist = uniqueList(clist)
+                local suncore = false
+                local muncore = false
+                local cuncore = false
                 for _, s in pairs(slist) do
                     if tmpSocketFlags[s] == 1 then
                         tmpSocketFlags[s] = 0
-                        uncore = true
+                        suncore = true
+                    end
+                end
+                for _, s in pairs(mlist) do
+                    if tmpNumaFlags[s] == 1 then
+                        tmpNumaFlags[s] = 0
+                        muncore = true
+                    end
+                end
+                for _, s in pairs(clist) do
+                    if tmpCacheFlags[s] == 1 then
+                        tmpCacheFlags[s] = 0
+                        cuncore = true
                     end
                 end
                 if perfexprs[k][i] == nil then
-                    if uncore then
-                        perfexprs[k][i] = uncoreevents
-                    else
-                        perfexprs[k][i] = coreevents
+                    local elist = {coreevents}
+                    if cuncore and llcevents:len() > 0 then
+                        table.insert(elist, llcevents)
+                    end
+                    if muncore and numaevents:len() > 0 then
+                        table.insert(elist, numaevents)
+                    end
+                    if suncore and socketevents:len() > 0 then
+                        table.insert(elist, socketevents)
                     end
+                    perfexprs[k][i] = table.concat(elist, ",")
                 end
             end
 
@@ -1187,9 +1308,9 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu
         glsize_var = tostring(math.tointeger(np))
         losize_var = tostring(math.tointeger(ppn))
     elseif mpitype == "slurm" then
-        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glrank_var = "${SLURM_PROCID:-$(($GLOBALSIZE * 2))}"
         glsize_var = tostring(math.tointeger(np))
-        losize_var = "${MPI_LOCALNRANKS:-$SLURM_NTASKS_PER_NODE}"
+        losize_var = string.format("${SLURM_NTASKS_PER_NODE:-%d}", math.tointeger(ppn))
     else
         print_stderr("Invalid MPI vendor "..mpitype)
         return
@@ -1251,7 +1372,7 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu
     f:write("GLOBALSIZE="..glsize_var.."\n")
     f:write("GLOBALRANK="..glrank_var.."\n")
     if os.getenv("OMP_NUM_THREADS") == nil then
-        f:write("unset OMP_NUM_THREADS\n")
+        f:write(string.format("export OMP_NUM_THREADS=%d\n", tpp))
     else
         f:write(string.format("export OMP_NUM_THREADS=%s\n", os.getenv("OMP_NUM_THREADS")))
     end
@@ -1582,7 +1703,6 @@ function percentile_table(inputtable, skip_cols, skip_lines)
 end
 
 function printMpiOutput(group_list, all_results, regionname)
-    print(print_stats)
     region = regionname or nil
     if #group_list == 0 or likwid.tablelength(all_results) == 0 then
         return
@@ -1761,10 +1881,13 @@ local cmd_options = {"h","help", -- default options for help message
                      "m","marker", -- options to activate MarkerAPI
                      "e:", "env:", -- options to forward environment variables
                      "ld",         -- option to activate debugging in likwid-perfctr
+                     "dist:",      -- option to specify distance between two MPI processes
+                     "o:","output:", -- option to specify an output file
+                     "mpiopts:", -- option to specify MPI options forwarded to the underlying MPI
                      "nperdomain:","pin:","hostfile:","O","f", "stats"} -- other options
 
 for opt,arg in likwid.getopt(arg,  cmd_options) do
-    if (type(arg) == "string") then
+    if (type(arg) == "string") and opt ~= "mpiopts" then
         local s,e = arg:find("-")
         if s == 1 then
             print_stderr(string.format("ERROR: Argument %s to option -%s starts with invalid character -.", arg, opt))
@@ -1796,14 +1919,77 @@ for opt,arg in likwid.getopt(arg,  cmd_options) do
             os.exit(1)
         end
     elseif opt == "t" or opt == "tpp" then
-        tpp = tonumber(arg)
-        if tpp == nil then
+        if arg:match("%d+:%a+") then
+            t, order = arg:match("(%d+):(%a+)")
+            tpp = tonumber(t)
+            if tpp == nil then
+                print_stderr("Argument for -t/-tpp must be a number")
+                os.exit(1)
+            end
+            if tpp == 0 then
+                print_stderr("Cannot run with 0 threads, at least 1 is required, sanitizing tpp to 1")
+                tpp = 1
+            end
+            local valid_order = false
+            for _, o in pairs(tpp_orderings) do
+                if o == order then
+                    valid_order = true
+                    break
+                end
+            end
+            if valid_order then
+                tpp_ordering = order
+            end
+            if debug then print_stdout(tpp, tpp_ordering) end -- diagnostic only, not regular output
+        elseif arg:match("%d+") then
+            tpp = tonumber(arg)
+            if tpp == nil then
+                print_stderr("Argument for -t/-tpp must be a number")
+                os.exit(1)
+            end
+            if tpp == 0 then
+                print_stderr("Cannot run with 0 threads, at least 1 is required, sanitizing tpp to 1")
+                tpp = 1
+            end
+        else
             print_stderr("Argument for -t/-tpp must be a number")
             os.exit(1)
         end
-        if tpp == 0 then
-            print_stderr("Cannot run with 0 threads, at least 1 is required, sanitizing tpp to 1")
-            tpp = 1
+    elseif opt == "dist" then
+        if arg:match("%d+:%a+") then
+            t, order = arg:match("(%d+):(%a+)")
+            local valid_order = false
+            for _, o in pairs(tpp_orderings) do
+                if o == order then
+                    valid_order = true
+                    break
+                end
+            end
+            if valid_order then
+                tpp_ordering = order
+            end
+            dist = tonumber(t)
+            if dist == nil then
+                print_stderr("Argument for -dist must be a number or number:ordering")
+                os.exit(1)
+            end
+            if dist == 0 then
+                print_stderr("Cannot run with distance 0, at least 1 is required, sanitizing dist to 1")
+                dist = 1
+            end
+        elseif arg:match("%d+") then
+            dist = tonumber(arg)
+            if dist == nil then
+                print_stderr("Argument for -dist must be a number or number:ordering")
+                os.exit(1)
+            end
+            if dist == 0 then
+                print_stderr("Cannot run with distance 0, at least 1 is required, sanitizing dist to 1")
+                dist = 1
+            end
+        else
+            print_stderr("Argument for -dist must be a number or number:ordering")
+            os.exit(1)
         end
     elseif opt == "nperdomain" then
         local domain, count, threads = arg:match("([NSCM]):(%d+)[:]*(%d*)")
@@ -1839,14 +2025,21 @@ for opt,arg in likwid.getopt(arg,  cmd_options) do
         omptype = arg
     elseif opt == "ld" then
         likwiddebug = true
+    elseif opt == "o" or opt == "output" then
+        outfile = arg
+        print_stderr("WARN: The output file option is currently ignored. Will be available in upcoming releases")
     elseif opt == "s" or opt == "skip" then
         skipStr = "-s "..arg
+    elseif opt == "mpiopts" then
+        mpiopts = tostring(arg)
     elseif opt == "?" then
         print_stderr("Invalid commandline option -"..arg)
         os.exit(1)
     elseif opt == "!" then
         print_stderr("Option requires an argument")
         os.exit(1)
+    elseif opt == "-" then
+        break
     end
 end
 
@@ -1861,49 +2054,21 @@ if use_marker and #perf == 0 then
     os.exit(1)
 end
 
-local test_mpiOpts = false
-for i=1,#arg do
-    if arg[i] == '--' then
-        test_mpiOpts = true
-    end
-    if not test_mpiOpts then
-        table.insert(executable, arg[i])
-    elseif arg[i] ~= '--' then
-        table.insert(mpiopts, arg[i])
-    end
+for _,x in ipairs(arg) do -- ipairs: positional entries 1..n only, in order
+    table.insert(executable, x)
 end
 
 if #executable == 0 then
     print_stderr("ERROR: No executable given on commandline")
     os.exit(1)
-else
-    local do_which = false
-    local found = false
-    if likwid.access(executable[1], "x") == -1 then
-        do_which = true
-    else
-        found = true
-    end
-    if not found then
-        if do_which then
-            local f = io.popen(string.format("which %s 2>/dev/null", executable[1]))
-            if f ~= nil then
-                executable[1] = f:read("*line")
-                f:close()
-                found = true
-            end
-            if debug then
-                print_stdout("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
-            end
-        end
-    end
-    if not found then
-        print_stderr("ERROR: Cannot find executable given on commandline")
-        os.exit(1)
-    end
 end
-if #mpiopts > 0 and debug then
-    print_stdout("DEBUG: MPI options given on commandline: "..table.concat(mpiopts, " "))
+
+if debug then
+    print_stdout("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
+end
+
+if mpiopts and mpiopts:len() > 0 and debug then
+    print_stdout("DEBUG: MPI options given on commandline: "..mpiopts)
 end
 
 if mpitype == nil then
@@ -2030,25 +2195,28 @@ if #cpuexprs > 0 then
         print_stderr(string.format("ERROR: You want %d processes but the pinning expression has only expressions for %d processes. There are only %d hosts in the host list.", np, #cpuexprs*#newhosts, #newhosts))
         os.exit(1)
     end
-else 
+else
     ppn = math.tointeger(np / givenNrNodes)
     if nperdomain == nil then
         nperdomain = "N:"..tostring(ppn)
         if tpp > 0 then
             nperdomain = nperdomain..":"..tostring(tpp)
+            if dist > 1 then
+                nperdomain = nperdomain..":"..tostring(dist)
+            end
         end
     end
-    domainname, count, threads = nperdomain:match("[E]*[:]*([NSCM]*):(%d+)[:]*(%d*)")
+    domainname, count, threads, distance = nperdomain:match("[E]*[:]*([NSCM]*):(%d+)[:]*(%d*)[:]*(%d*)")
     if math.tointeger(threads) == nil then
         if tpp > 1 then
-            nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp)
+            nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp, dist)
         else
             tpp = 1
-            nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp)
+            nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp, dist)
         end
     else
         tpp = math.tointeger(threads)
-        nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp)
+        nperdomain = string.format("E:%s:%d:%d", domainname, count, tpp, dist)
     end
     cpuexprs = calculateCpuExprs(nperdomain, cpuexprs)
     if debug then
@@ -2066,7 +2234,7 @@ else
         end
         for i=np+1,ppn do
             if debug then
-                print_stderr("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
+                print_stderr("WARN: Remove cpuexpr: "..table.concat(cpuexprs[#cpuexprs], ","))
             end
             table.remove(cpuexprs, #cpuexprs)
         end
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
index f5de7f097..4ad4f06c4 100644
--- a/src/applications/likwid-perfctr.lua
+++ b/src/applications/likwid-perfctr.lua
@@ -85,7 +85,7 @@ local function usage()
     io.stdout:write("-i, --info\t\t Print CPU info\n")
     io.stdout:write("-T <time>\t\t Switch eventsets with given frequency\n")
     io.stdout:write("-f, --force\t\t Force overwrite of registers if they are in use\n")
-    io.stdout:write("Modes:")
+    io.stdout:write("Modes:\n")
     io.stdout:write("-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms\n")
     io.stdout:write("-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms\n")
     io.stdout:write("\t\t\t The output format (to stderr) is:\n")
@@ -692,6 +692,7 @@ if print_info or verbose > 0 then
     print_stdout(string.format("CPU short:\t%s", cpuinfo["short_name"]))
     print_stdout(string.format("CPU stepping:\t%u", cpuinfo["stepping"]))
     print_stdout(string.format("CPU features:\t%s", cpuinfo["features"]))
+    print_stdout(string.format("CPU arch:\t%s", cpuinfo["architecture"]))
     P6_FAMILY = 6
     if cpuinfo["family"] == P6_FAMILY and cpuinfo["perf_version"] > 0 then
         print_stdout(likwid.hline)
@@ -934,8 +935,6 @@ for i, event_string in pairs(event_string_list) do
         end
         local gid = likwid.addEventSet(event_string)
         if gid < 0 then
-            likwid.putTopology()
-            likwid.putConfiguration()
             likwid.finalize()
             os.exit(1)
         end
@@ -958,8 +957,6 @@ if gpusSupported then
 end
 if #group_ids == 0 and not (#gpu_event_string_list > 0 and use_marker) then
     print_stderr("ERROR: No valid eventset given on commandline. Exiting...")
-    likwid.putTopology()
-    likwid.putConfiguration()
     likwid.finalize()
     os.exit(1)
 end
@@ -1136,8 +1133,6 @@ if not use_marker then
     if ret < 0 then
         print_stderr(string.format("Error stopping counters for thread %d.",ret * (-1)))
         likwid.finalize()
-        likwid.putTopology()
-        likwid.putConfiguration()
         os.exit(exitvalue)
     end
 end
@@ -1231,7 +1226,4 @@ end
 --    likwid.gpuFinalize()
 --end
 likwid.finalize()
-likwid.putTopology()
-likwid.putNumaInfo()
-likwid.putConfiguration()
 os.exit(exitvalue)
diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua
index d1d88bf5d..61f92cea0 100644
--- a/src/applications/likwid-pin.lua
+++ b/src/applications/likwid-pin.lua
@@ -78,6 +78,7 @@ local function usage()
     print_stdout("-v, --version\t\t Version information")
     print_stdout("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
     print_stdout("-i\t\t\t Set numa interleave policy with all involved numa nodes")
+    print_stdout("-m\t\t\t Set numa membind policy with all involved numa nodes")
     print_stdout("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
     print_stdout("-c/-C <list>\t\t Comma separated processor IDs or expression")
     print_stdout("-s, --skip <hex>\t Bitmask with threads to skip")
@@ -101,6 +102,7 @@ delimiter = ','
 quiet = 0
 sweep_sockets = false
 interleaved_policy = false
+membind_policy = false
 print_domains = false
 cpu_list = {}
 skip_mask = nil
@@ -115,7 +117,7 @@ if (#arg == 0) then
     os.exit(0)
 end
 
-for opt,arg in likwid.getopt(arg, {"c:", "C:", "d:", "h", "i", "p", "q", "s:", "S", "t:", "v", "V:", "verbose:", "help", "version", "skip","sweep", "quiet"}) do
+for opt,arg in likwid.getopt(arg, {"c:", "C:", "d:", "h", "i", "m", "p", "q", "s:", "S", "t:", "v", "V:", "verbose:", "help", "version", "skip","sweep", "quiet"}) do
     if opt == "h" or opt == "help" then
         usage()
         close_and_exit(0)
@@ -133,6 +135,8 @@ for opt,arg in likwid.getopt(arg, {"c:", "C:", "d:", "h", "i", "p", "q", "s:", "
         sweep_sockets = true
     elseif (opt == "i") then
         interleaved_policy = true
+    elseif (opt == "m") then
+        membind_policy = true
     elseif (opt == "p") then
         print_domains = true
     elseif opt == "s" or opt == "skip" then
@@ -216,6 +220,16 @@ if interleaved_policy then
         print_stdout("No need to set mem_policy to interleaved, only one NUMA node available")
     end
 end
+if membind_policy then -- requested with -m
+    if numainfo["numberOfNodes"] > 1 then
+        if verbose > 0 and quiet == 0 then
+            print_stdout("Set mem_policy to membind")
+        end
+        likwid.setMembind(num_threads, cpu_list) -- presumably binds memory to the NUMA nodes of the listed CPUs; confirm in the C binding
+    else
+        print_stdout("No need to set mem_policy to membind, only one NUMA node available") -- single node: the policy call is skipped
+    end
+end
 
 if sweep_sockets then
     if verbose > 0 and quiet == 0 then
@@ -279,7 +293,7 @@ if verbose > 0 and quiet == 0 then
     for _, c in pairs(cpu_list) do
         -- Check whether the Lua version has bit32 module (>= 5.2)
         if not bit32 then
-            mask = mask | (1<<c)
+            mask = mask + 2 ^ c
         else
             mask = bit32.bor(mask, bit32.lshift(1, c))
         end
diff --git a/src/applications/likwid-setFrequencies.lua b/src/applications/likwid-setFrequencies.lua
index 13456149f..2c4d589be 100644
--- a/src/applications/likwid-setFrequencies.lua
+++ b/src/applications/likwid-setFrequencies.lua
@@ -51,7 +51,8 @@ function usage()
     print_stdout("-h\t\t Help message")
     print_stdout("-v\t\t Version information")
     print_stdout("-V <0-3>\t Verbosity (0=only_error, 3=developer)")
-    print_stdout("-c dom\t\t Likwid thread domain which to apply settings (default are all CPUs)")
+    print_stdout("-c dom\t\t CPU selection or LIKWID thread domain")
+    print_stdout("\t\t Default behavior is to apply the frequencies to all CPUs.")
     print_stdout("\t\t See likwid-pin -h for details")
     print_stdout("-g gov\t\t Set governor (" .. table.concat(likwid.getAvailGovs(0), ", ") .. ")")
     print_stdout("-f/--freq freq\t Set minimal and maximal CPU frequency")
@@ -73,9 +74,6 @@ function usage()
     print_stdout("\t acpi-cpufreq driver: set the userspace governor implicitly")
     print_stdout("\t intel_pstate driver: keep current governor")
     print_stdout("")
-    print_stdout("The min/max frequencies can be slightly off with the intel_pstate driver as")
-    print_stdout("the value is calculated while the current frequency is read from sysfs.")
-    print_stdout("")
     print_stdout("In general the min/max Uncore frequency can be set freely, even to 0 or 1E20")
     print_stdout("but the hardware stays inside its limits. LIKWID reduces the range of possible")
     print_stdout("frequencies to the minimal core frequency (likwid-setFrequencies -l) and the ")
@@ -104,12 +102,24 @@ function round(x)
         if s:sub(slen,slen) ~= "0" then break end
         slen = slen - 1
     end
-    if slen > 5 then
-        slen = 5
-    end
+    if s:sub(slen,slen) == "." then slen = slen - 1 end
     return s:sub(1, slen)
 end
 
+-- Check whether freq can be selected: it must either be an entry of
+-- freq_list or be equal to turbofreq. Returns true or false.
+function valid_freq(freq, freq_list, turbofreq)
+    local matches_turbo = (freq == turbofreq)
+    for _, candidate in pairs(freq_list) do
+        if candidate == freq then
+            -- listed frequency, no need to look further
+            return true
+        end
+    end
+    -- not in the list, so only the turbo frequency remains acceptable
+    return matches_turbo
+end
+
 verbosity = 0
 governor = nil
 frequency = nil
@@ -125,6 +135,7 @@ do_reset = false
 do_ureset = false
 set_turbo = false
 turbo = 0
+driver = nil
 
 if #arg == 0 then
     usage()
@@ -144,13 +155,13 @@ for opt,arg in likwid.getopt(arg, {"V:", "g:", "c:", "f:", "l", "p", "h", "v", "
     elseif (opt == "g") then
         governor = arg
     elseif opt == "f" or opt == "freq" then
-        frequency = arg
-        min_freq = arg
-        max_freq = arg
+        frequency = arg*1E6
+        min_freq = arg*1E6
+        max_freq = arg*1E6
     elseif opt == "x" or opt == "min" then
-        min_freq = arg
+        min_freq = arg*1E6
     elseif opt == "y" or opt == "max" then
-        max_freq = arg
+        max_freq = arg*1E6
     elseif opt == "t" or opt == "turbo" then
         set_turbo = true
         local t = tonumber(arg)
@@ -163,6 +174,7 @@ for opt,arg in likwid.getopt(arg, {"V:", "g:", "c:", "f:", "l", "p", "h", "v", "
         local s = tonumber(arg)
         if (s >= 0 and s <= 3) then
             verbosity = s
+            likwid.setVerbosity(s)
         else
             print_stderr(string.format("ERROR: Value %s for verbosity not valid", arg))
         end
@@ -189,6 +201,7 @@ for opt,arg in likwid.getopt(arg, {"V:", "g:", "c:", "f:", "l", "p", "h", "v", "
     end
 end
 
+cpuinfo = likwid.getCpuInfo()
 topo = likwid.getCpuTopology()
 affinity = likwid.getAffinityInfo()
 if not domain or domain == "N" then
@@ -229,6 +242,9 @@ for i, dom in pairs(affinity["domains"]) do
     end
 end
 
+likwid.initFreq()
+driver = likwid.getFreqDriver(cpulist[1])
+
 if verbosity == 3 then
     print_stdout(string.format("DEBUG: Given CPU expression expands to %d CPU cores:", numthreads))
     local str = "DEBUG: " .. tostring(cpulist[1])
@@ -247,6 +263,7 @@ end
 
 if printAvailGovs then
     local govs = likwid.getAvailGovs(0)
+    govs = likwid.getAvailGovs(0)
     if #govs > 0 then
         print_stdout("Available governors:")
         print_stdout(string.format("%s", table.concat(govs, " ")))
@@ -259,9 +276,22 @@ if printAvailFreq then
     local freqs = likwid.getAvailFreq(0)
     if #freqs > 0 then
         print_stdout("Available frequencies:")
-        print_stdout(string.format("%s", table.concat(freqs, " ")))
+        outfreqs = {}
+        for _, f in pairs(freqs) do
+            table.insert(outfreqs, round(tonumber(f)/1.E6))
+        end
+        print_stdout(string.format("%s", table.concat(outfreqs, " ")))
     else
         print_stdout("Cannot get frequencies from cpufreq module")
+        if driver == "intel_pstate" then
+            freqs = {}
+            min = tonumber(likwid.getConfCpuClockMin(cpulist[1]))/1E6
+            max = tonumber(likwid.getConfCpuClockMax(cpulist[1]))/1E6
+            print_stdout("The intel_pstate module allows free selection of frequencies in the available range")
+            print_stdout(string.format("Minimal CPU frequency %s", round(min)))
+            print_stdout(string.format("Maximal CPU frequency %s", round(max)))
+            os.exit(0)
+        end
     end
 end
 
@@ -270,9 +300,9 @@ if printCurFreq then
     local processed = 0
     for i=1,#cpulist do
         gov = likwid.getGovernor(cpulist[i])
-        freq = tonumber(likwid.getCpuClockCurrent(cpulist[i]))/1E9
-        min = tonumber(likwid.getCpuClockMin(cpulist[i]))/1E9
-        max = tonumber(likwid.getCpuClockMax(cpulist[i]))/1E9
+        freq = tonumber(likwid.getCpuClockCurrent(cpulist[i]))/1E6
+        min = tonumber(likwid.getCpuClockMin(cpulist[i]))/1E6
+        max = tonumber(likwid.getCpuClockMax(cpulist[i]))/1E6
         t = tonumber(likwid.getTurbo(cpulist[i]));
         if gov and freq and min and max and t >= 0 then
             processed = processed + 1
@@ -299,60 +329,111 @@ if printCurFreq then
 end
 
 if printAvailGovs or printAvailFreq or printCurFreq then
+    likwid.finalizeFreq()
     os.exit(0)
 end
 
 if do_reset then
-    local f = likwid.setTurbo(cpulist[1], 0)
     local availfreqs = likwid.getAvailFreq(cpulist[1])
     local availgovs = likwid.getAvailGovs(cpulist[1])
+    if driver == "intel_pstate" then
+        availfreqs = {likwid.getConfCpuClockMin(cpulist[1]), likwid.getConfCpuClockMax(cpulist[1])}
+    end
     if not min_freq then
         min_freq = availfreqs[1]
     end
+    if min_freq >= availfreqs[#availfreqs] then
+        min_freq = availfreqs[#availfreqs]
+    end
     if not (set_turbo or max_freq) then
         set_turbo = true
         turbo = 0
         max_freq = availfreqs[#availfreqs]
+        if max_freq <= availfreqs[1] then
+            max_freq = availfreqs[1]
+        end
     end
     if not governor then
-        governor = availgovs[#availgovs]
+        governor = nil
+        for i, g in pairs(availgovs) do
+            if g:match("^performance") then
+                governor = g
+                break
+            end
+        end
+        if not governor then
+            for i, g in pairs(availgovs) do
+                if g:match("^conservative") then
+                    governor = g
+                    break
+                end
+            end
+            if not governor then
+                governor = availgovs[#availgovs]
+            end
+        end
     end
     if min_freq and governor then
-        print_stdout(string.format("Reset to governor %s with min freq. %g GHz and deactivate turbo mode", governor, min_freq))
+        print_stdout(string.format("Reset to governor %s with min freq. %s GHz and deactivate turbo mode", governor, round(min_freq/1E6)))
     end
 end
 
 if do_ureset then
-    local availfreqs = likwid.getAvailFreq(cpulist[1])
-    local power = likwid.getPowerInfo()
-    local minf = tonumber(availfreqs[1])
-    local maxf = tonumber(power["turbo"]["steps"][1]) / 1000
-    min_u_freq = minf
-    max_u_freq = maxf
+    if cpuinfo["isIntel"] == 1 then
+        local availfreqs = likwid.getAvailFreq(cpulist[1])
+        if #availfreqs == 0 then
+            availfreqs = {likwid.getConfCpuClockMin(cpulist[1]), likwid.getConfCpuClockMax(cpulist[1])}
+        end
+        local power = likwid.getPowerInfo()
+        local minf = tonumber(availfreqs[1]/1E6)
+        if (minf > tonumber(availfreqs[#availfreqs]/1E6)) then
+            minf = tonumber(availfreqs[#availfreqs]/1E6)
+        end
+        local maxf = tonumber(power["turbo"]["steps"][1]) / 1000
+        if (minf > maxf) then
+            local s = minf
+            minf = maxf
+            maxf = s
+        end
+        min_u_freq = minf
+        max_u_freq = maxf
+    else
+        print_stderr("ERROR: AMD CPUs provide no interface to manipulate the Uncore frequency.")
+        likwid.finalizeFreq()
+        os.exit(1)
+    end
 end
 
 if numthreads > 0 and not (frequency or min_freq or max_freq or governor or min_u_freq or max_u_freq or set_turbo) then
     print_stderr("ERROR: You need to set either a frequency or governor for the selected CPUs on commandline")
+    likwid.finalizeFreq()
     os.exit(1)
 end
 
 if min_freq and max_freq and min_freq > max_freq then
     print_stderr("ERROR: Minimal CPU frequency higher than maximal frequency.")
+    likwid.finalizeFreq()
     os.exit(1)
 end
 if min_freq and max_freq and max_freq < min_freq then
     print_stderr("ERROR: Maximal CPU frequency lower than minimal frequency.")
+    likwid.finalizeFreq()
     os.exit(1)
 end
 if min_u_freq and max_u_freq and max_u_freq < min_u_freq then
     print_stderr("ERROR: Maximal Uncore frequency lower than minimal frequency.")
+    likwid.finalizeFreq()
     os.exit(1)
 end
 
 
-local availfreqs = likwid.getAvailFreq(cpulist[i])
-if (frequency or min_freq or max_freq) and #availfreqs == 0 then
+local availfreqs = likwid.getAvailFreq(cpulist[1])
+if driver == "intel_pstate" then
+    availfreqs = {likwid.getConfCpuClockMin(cpulist[1])/1E6, likwid.getConfCpuClockMax(cpulist[1])/1E6}
+end
+if (frequency or min_freq or max_freq) and #availfreqs == 0 and likwid.getFreqDriver(cpulist[1]) ~= "intel_pstate" then
     print_stdout("Cannot set CPU frequency, cpufreq module not properly loaded")
+    likwid.finalizeFreq()
     os.exit(1)
 end
 local savailfreqs = {}
@@ -360,96 +441,120 @@ for i,f in pairs(availfreqs) do
     savailfreqs[i] = round(f)
 end
 if verbosity == 3 then
-    print_stdout("DEBUG Available freq.: "..table.concat(availfreqs, ", "))
+    print_stdout("DEBUG Available freq.: "..table.concat(savailfreqs, ", "))
+end
+if driver ~= "intel_pstate" then
+    if max_freq then
+        local test_freq = round(tonumber(max_freq))
+        if not valid_freq(test_freq, savailfreqs, availturbo) then
+            print_stderr(string.format("ERROR: Selected max. frequency %s not available! Please select one of\n%s", test_freq, table.concat(savailfreqs, ", ")))
+            likwid.finalizeFreq()
+            os.exit(1)
+        end
+    end
+    if min_freq then
+        local test_freq = round(tonumber(min_freq))
+        if not valid_freq(test_freq, savailfreqs, availturbo) then
+            print_stderr(string.format("ERROR: Selected min. frequency %s not available! Please select one of\n%s", test_freq, table.concat(savailfreqs, ", ")))
+            likwid.finalizeFreq()
+            os.exit(1)
+        end
+    end
 end
 
+min_first = false -- true: apply the new minimum before the new maximum
+max_first = false -- true: apply the new maximum before the new minimum
+if min_freq and tonumber(min_freq)/1E6 > tonumber(likwid.getCpuClockMax(cpulist[1]))/1E6 then -- new min lies above the current max of the first selected CPU
+    max_first = true
+end
+if max_freq and tonumber(max_freq)/1E6 < tonumber(likwid.getCpuClockMin(cpulist[1]))/1E6 then -- new max lies below the current min of the first selected CPU
+    min_first = true
+end
 
-for x=1,2 do
-    if min_freq then
-        for i=1,#cpulist do
-            local valid_freq = false
-            for k,v in pairs(savailfreqs) do
-                if (tonumber(min_freq) == tonumber(v)) then
-                    if verbosity == 3 then
-                        print_stdout(string.format("DEBUG: Min frequency %g valid", min_freq))
-                    end
-                    valid_freq = true
-                    break
-                end
-            end
-            if min_freq == availturbo then
-                valid_freq = true
-            end
-            if not valid_freq then
-                print_stderr(string.format("ERROR: Selected min. frequency %s not available for CPU %d! Please select one of\n%s", min_freq, cpulist[i], table.concat(savailfreqs, ", ")))
-                os.exit(1)
-            end
-            if verbosity == 3 then
-                print_stdout(string.format("DEBUG: Set min. frequency for CPU %d to %d", cpulist[i], tonumber(min_freq)*1E6))
-            end
-            local f = likwid.setCpuClockMin(cpulist[i], tonumber(min_freq)*1E6)
+if set_turbo then
+    for i=1,#cpulist do
+        if verbosity == 3 then
+            print_stdout(string.format("DEBUG: Set turbo mode for CPU %d to %d", cpulist[i], turbo))
         end
+        local f = likwid.setTurbo(cpulist[i], turbo)
     end
+end
 
 
-    if set_turbo then
+if max_first and max_freq then
+    for i=1,#cpulist do
+        local f = likwid.setCpuClockMax(cpulist[i], max_freq)
+    end
+    if min_freq then
         for i=1,#cpulist do
-            if verbosity == 3 then
-                print_stdout(string.format("DEBUG: Set turbo mode for CPU %d to %d", cpulist[i], turbo))
-            end
-            local f = likwid.setTurbo(cpulist[i], turbo)
+            local f = likwid.setCpuClockMin(cpulist[i], min_freq)
         end
     end
+elseif min_first and min_freq then
+    for i=1,#cpulist do
+        local f = likwid.setCpuClockMin(cpulist[i], min_freq)
+    end
     if max_freq then
         for i=1,#cpulist do
-            local valid_freq = false
-            for k,v in pairs(savailfreqs) do
-                if (tonumber(max_freq) == tonumber(v)) then
-                    if verbosity == 3 then
-                        print_stdout(string.format("DEBUG: Max frequency %g valid", max_freq))
-                    end
-                    valid_freq = true
-                    break
-                end
-            end
-            if max_freq == availturbo then
-                valid_freq = true
-            end
-            if not valid_freq then
-                print_stderr(string.format("ERROR: Selected max. frequency %s not available for CPU %d! Please select one of\n%s", max_freq, cpulist[i], table.concat(savailfreqs, ", ")))
-                os.exit(1)
-            end
-            if verbosity == 3 then
-                print_stdout(string.format("DEBUG: Set max. frequency for CPU %d to %d", cpulist[i], tonumber(max_freq)*1E6))
-            end
-            local f = likwid.setCpuClockMax(cpulist[i], tonumber(max_freq)*1E6)
+            local f = likwid.setCpuClockMax(cpulist[i], max_freq)
         end
     end
+else
+    for i=1,#cpulist do
+        if min_freq then likwid.setCpuClockMin(cpulist[i], min_freq) end -- skip limits that were not requested (e.g. governor-only call)
+        if max_freq then likwid.setCpuClockMax(cpulist[i], max_freq) end
+    end
 end
 
 if min_u_freq then
-    for s=1,#socklist do
-        socket = socklist[s]
-        if verbosity == 3 then
-            print_stdout(string.format("DEBUG: Set min. uncore frequency for socket %d to %d MHz", socket, min_u_freq*1000))
-        end
-        local err = likwid.setUncoreFreqMin(socket, min_u_freq*1000);
-        if err ~= 0 then
-            print_stderr(string.format("Setting of minimal Uncore frequency %f failed on socket %d", tonumber(min_u_freq)*1000, socket))
+    local test = likwid.getUncoreFreqMin(socklist[1])
+    if test == 0 then -- a reading of 0 is treated as "no Uncore frequency interface"
+        print_stderr("ERROR: This CPU does not provide an interface to manipulate the Uncore frequency.")
+        likwid.finalizeFreq() -- release the frequency module before exiting, as the other error exits do
+        os.exit(1)
+    end
+end
+if max_u_freq then
+    local test = likwid.getUncoreFreqMax(socklist[1])
+    if test == 0 then -- a reading of 0 is treated as "no Uncore frequency interface"
+        print_stderr("ERROR: This CPU does not provide an interface to manipulate the Uncore frequency.")
+        likwid.finalizeFreq() -- release the frequency module before exiting, as the other error exits do
+        os.exit(1)
+    end
+end
+
+
+if min_u_freq then
+    if cpuinfo["isIntel"] == 1 then
+        for s=1,#socklist do
+            socket = socklist[s]
+            if verbosity == 3 then
+                print_stdout(string.format("DEBUG: Set min. uncore frequency for socket %d to %d MHz", socket, min_u_freq*1000))
+            end
+            local err = likwid.setUncoreFreqMin(socket, min_u_freq*1000);
+            if err ~= 0 then
+                print_stderr(string.format("Setting of minimal Uncore frequency %f failed on socket %d", tonumber(min_u_freq)*1000, socket))
+            end
         end
+    else
+        print_stderr("ERROR: AMD CPUs provide no interface to manipulate the Uncore frequency.")
     end
 end
 
 if max_u_freq then
-    for s=1,#socklist do
-        socket = socklist[s]
-        if verbosity == 3 then
-            print_stdout(string.format("DEBUG: Set max. uncore frequency for socket %d to %d MHz", socket, max_u_freq*1000))
-        end
-        local err = likwid.setUncoreFreqMax(socket, max_u_freq*1000);
-        if err ~= 0 then
-            print_stderr(string.format("Setting of maximal Uncore frequency %d failed on socket %d", tonumber(max_u_freq)*1000, socket))
+    if cpuinfo["isIntel"] == 1 then
+        for s=1,#socklist do
+            socket = socklist[s]
+            if verbosity == 3 then
+                print_stdout(string.format("DEBUG: Set max. uncore frequency for socket %d to %d MHz", socket, max_u_freq*1000))
+            end
+            local err = likwid.setUncoreFreqMax(socket, max_u_freq*1000);
+            if err ~= 0 then
+                print_stderr(string.format("Setting of maximal Uncore frequency %d failed on socket %d", tonumber(max_u_freq)*1000, socket))
+            end
         end
+    else
+        print_stderr("ERROR: AMD CPUs provide no interface to manipulate the Uncore frequency.")
     end
 end
 
@@ -475,21 +580,22 @@ if governor then
         end
     end
     local cur_freqs = {}
-    if not valid_gov then
-        print_stderr(string.format("ERROR: Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
-        os.exit(1)
-    end
-    for i=1,#cpulist do
-        if verbosity == 3 then
-            print_stdout(string.format("DEBUG: Set governor for CPU %d to %s", cpulist[i], governor))
-        end
-        local f = likwid.setGovernor(cpulist[i], governor)
-        if do_reset then
-            likwid.setCpuClockMin(cpulist[i], cur_min[i])
-            likwid.setCpuClockMax(cpulist[i], cur_max[i])
+    if valid_gov then
+        for i=1,#cpulist do
+            if verbosity == 3 then
+                print_stdout(string.format("DEBUG: Set governor for CPU %d to %s", cpulist[i], governor))
+            end
+            local f = likwid.setGovernor(cpulist[i], governor)
+            if do_reset then
+                likwid.setCpuClockMin(cpulist[i], cur_min[i])
+                likwid.setCpuClockMax(cpulist[i], cur_max[i])
+            end
         end
+    else
+        print_stderr(string.format("ERROR: Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
     end
 end
+likwid.finalizeFreq()
 likwid.putAffinityInfo()
 likwid.putTopology()
 os.exit(0)
diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua
index b27e8701e..d72b05abb 100644
--- a/src/applications/likwid.lua
+++ b/src/applications/likwid.lua
@@ -86,6 +86,7 @@ likwid.putTopology = likwid_putTopology
 likwid.getNumaInfo = likwid_getNumaInfo
 likwid.putNumaInfo = likwid_putNumaInfo
 likwid.setMemInterleaved = likwid_setMemInterleaved
+likwid.setMembind = likwid_setMembind
 likwid.getAffinityInfo = likwid_getAffinityInfo
 likwid.putAffinityInfo = likwid_putAffinityInfo
 likwid.getPowerInfo = likwid_getPowerInfo
@@ -135,6 +136,7 @@ likwid.registerRegion = likwid_registerRegion
 likwid.startRegion = likwid_startRegion
 likwid.stopRegion = likwid_stopRegion
 likwid.getRegion = likwid_getRegion
+likwid.resetRegion = likwid_resetRegion
 likwid.initCpuFeatures = likwid_cpuFeaturesInit
 likwid.getCpuFeatures = likwid_cpuFeaturesGet
 likwid.enableCpuFeatures = likwid_cpuFeaturesEnable
@@ -142,22 +144,26 @@ likwid.disableCpuFeatures = likwid_cpuFeaturesDisable
 likwid.readMarkerFile = likwid_readMarkerFile
 likwid.destroyMarkerFile = likwid_destroyMarkerFile
 likwid.markerNumRegions = likwid_markerNumRegions
-likwid.gpuMarkerRegionGroup = likwid_gpuMarkerRegionGroup
-likwid.gpuMarkerRegionTag = likwid_gpuMarkerRegionTag
-likwid.gpuMarkerRegionEvents = likwid_gpuMarkerRegionEvents
-likwid.gpuMarkerRegionCpulist = likwid_gpuMarkerRegionCpulist
-likwid.gpuMarkerRegionThreads = likwid_gpuMarkerRegionThreads
-likwid.gpuMarkerRegionTime = likwid_gpuMarkerRegionTime
-likwid.gpuMarkerRegionCount = likwid_gpuMarkerRegionCount
-likwid.gpuMarkerRegionResult = likwid_gpuMarkerRegionResult
-likwid.gpuMarkerRegionMetric = likwid_gpuMarkerRegionMetric
+likwid.markerRegionGroup = likwid_markerRegionGroup
+likwid.markerRegionTag = likwid_markerRegionTag
+likwid.markerRegionEvents = likwid_markerRegionEvents
+likwid.markerRegionCpulist = likwid_markerRegionCpulist
+likwid.markerRegionThreads = likwid_markerRegionThreads
+likwid.markerRegionTime = likwid_markerRegionTime
+likwid.markerRegionCount = likwid_markerRegionCount
+likwid.markerRegionResult = likwid_markerRegionResult
+likwid.markerRegionMetric = likwid_markerRegionMetric
+likwid.initFreq = likwid_initFreq
 likwid.getCpuClockCurrent = likwid_getCpuClockCurrent
 likwid.getCpuClockMin = likwid_getCpuClockMin
+likwid.getConfCpuClockMin = likwid_getConfCpuClockMin
 likwid.setCpuClockMin = likwid_setCpuClockMin
 likwid.getCpuClockMax = likwid_getCpuClockMax
+likwid.getConfCpuClockMax = likwid_getConfCpuClockMax
 likwid.setCpuClockMax = likwid_setCpuClockMax
 likwid.getGovernor = likwid_getGovernor
 likwid.setGovernor = likwid_setGovernor
+likwid.finalizeFreq = likwid_finalizeFreq
 likwid.setTurbo = likwid_setTurbo
 likwid.getTurbo = likwid_getTurbo
 likwid.setUncoreFreqMin = likwid_setUncoreFreqMin
@@ -230,7 +236,7 @@ local function getopt(args, ostrlist)
                     if #args[1] == 2 then -- found "--"
                         place = 0
                         table.remove(args, 1)
-                        return args[1], nil
+                        return "-", nil
                     end
                     place = place + 1
                 end
@@ -743,6 +749,7 @@ local function min_max_avg(values)
 end
 
 local function tableMinMaxAvgSum(inputtable, skip_cols, skip_lines)
+    local function isSpecial(s) return s == "nan" or s == "-" or s == "int" end -- true for placeholder cells; NOTE(review): "int" may be meant as "inf" - confirm
     local outputtable = {}
     local nr_columns = #inputtable
     if nr_columns == 0 then
@@ -765,14 +772,12 @@ local function tableMinMaxAvgSum(inputtable, skip_cols, skip_lines)
     for j=skip_cols+1,nr_columns do
         for i=skip_lines+1, nr_lines do
             local res = tonumber(inputtable[j][i])
-            if inputtable[j][i] ~= "nan" and inputtable[j][i] ~= "-" then
-                if res ~= nil then
-                    minOfLine[i-skip_lines+1] = math.min(res, minOfLine[i-skip_lines+1])
-                    maxOfLine[i-skip_lines+1] = math.max(res, maxOfLine[i-skip_lines+1])
-                    sumOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1] + res
-                end
-                avgOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1]/(nr_columns-skip_cols)
+            if res ~= nil then
+                minOfLine[i-skip_lines+1] = math.min(res, minOfLine[i-skip_lines+1])
+                maxOfLine[i-skip_lines+1] = math.max(res, maxOfLine[i-skip_lines+1])
+                sumOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1] + res
             end
+            avgOfLine[i-skip_lines+1] = sumOfLine[i-skip_lines+1]/(nr_columns-skip_cols)
         end
     end
     for i=2,#minOfLine do
@@ -1218,9 +1223,13 @@ end
 likwid.gethostname = gethostname
 
 local function getjid()
-    local jid = os.getenv("PBS_JOBID")
-    if jid == nil then
-        jid = "X"
+    local jid = "X" -- fallback when no batch system variable is set
+    for _, v in ipairs({"PBS_JOBID", "SLURM_JOB_ID", "SLURM_JOBID", "LOADL_STEP_ID", "LSB_JOBID" }) do -- first variable that is set wins
+        local x = os.getenv(v)
+        if x then
+            jid = x
+            break
+        end
     end
     return jid
 end
@@ -1228,11 +1237,12 @@ end
 likwid.getjid = getjid
 
 local function getMPIrank()
-    local rank = os.getenv("PMI_RANK")
-    if rank == nil then
-        rank = os.getenv("OMPI_COMM_WORLD_RANK")
-        if rank == nil then
-            rank = "X"
+    local rank = "X" -- fallback when no launcher variable is set
+    for _, v in ipairs({"PMI_RANK", "OMPI_COMM_WORLD_RANK", "SLURM_PROCID"}) do -- first variable that is set wins
+        local x = os.getenv(v)
+        if x then
+            rank = x
+            break
         end
     end
     return rank
@@ -1240,6 +1250,17 @@ end
 
 likwid.getMPIrank = getMPIrank
 
+local function llikwid_getFreqDriver(cpu) -- returns the cpufreq scaling driver name of 'cpu', or nil if the sysfs file cannot be opened
+    local file = string.format("/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", cpu)
+    local f = io.open(file, "rb")
+    if f then
+        local drv = (f:read("*l") or ""):gsub("%s+", "") -- read() yields nil on an empty file; only gsub's first result is kept
+        f:close()
+        return drv
+    end
+end
+
+likwid.getFreqDriver = llikwid_getFreqDriver
 
 local function llikwid_getAvailFreq(cpu)
     local freq_str = likwid_getAvailFreq(cpu)
diff --git a/src/bstrlib_helper.c b/src/bstrlib_helper.c
index 2fa268767..f092fc136 100644
--- a/src/bstrlib_helper.c
+++ b/src/bstrlib_helper.c
@@ -1,87 +1,111 @@
+
 
+
+/*
+ * =======================================================================================
+ *
+ *      Filename:  bstrlib_helper.c
+ *
+ *      Description:  Additional functions to the bstrlib library
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
 
-#include <stdio.h>
-#include <stdlib.h>
-
-
-//#include <unistd.h>
-//#include <string.h>
 
 #include <bstrlib.h>
+#include <errno.h>
 
 
 int bstrListAdd(struct bstrList * sl, bstring str)
 {
-	if (sl->qty >= sl->mlen) {
+    if (sl->qty >= sl->mlen) { /* list is full: grow the entry table before appending */
         int mlen = sl->mlen * 2;
         bstring * tbl;
-    
-
-	    while (sl->qty >= mlen) {
-	        if (mlen < sl->mlen) return BSTR_ERR;
-	        mlen += mlen;
-	    }
+        while (sl->qty >= mlen) {
+            if (mlen < sl->mlen) return BSTR_ERR; /* doubling overflowed */
+            mlen += (mlen > 0) ? mlen : 1; /* a capacity of 0 must still grow, doubling 0 would loop forever */
+        }
 
-	    tbl = (bstring *) realloc (sl->entry, sizeof (bstring) * mlen);
-	    if (tbl == NULL) return BSTR_ERR;
+        tbl = (bstring *) realloc (sl->entry, sizeof (bstring) * mlen);
+        if (tbl == NULL) return BSTR_ERR; /* old table is still valid and owned by sl */
 
-	    sl->entry = tbl;
-	    sl->mlen = mlen;
-	}
-	sl->entry[sl->qty] = bstrcpy(str);
+        sl->entry = tbl;
+        sl->mlen = mlen;
+    }
+    sl->entry[sl->qty] = bstrcpy(str); /* the list stores its own copy; the caller keeps ownership of str */
     sl->qty++;
     return BSTR_OK;
 }
 
 int bstrListAddChar(struct bstrList * sl, char* str)
 {
-	if (!sl || !str) return BSTR_ERR;
-	bstring tmp = bformat("%s", str);
-	int err = bstrListAdd(sl, tmp);
-	bdestroy(tmp);
-	return err;
+    if (!sl || !str) return BSTR_ERR;
+    bstring tmp = bformat("%s", str); /* temporary bstring built from the C string */
+    int err = tmp ? bstrListAdd(sl, tmp) : BSTR_ERR; /* do not append a NULL entry if the conversion failed */
+    bdestroy(tmp); /* bstrListAdd() stored its own copy */
+    return err;
 }
 
 void bstrListPrint(struct bstrList * sl)
 {
-	int i = 0;
-	if (!sl) return;
-	if (sl->qty > 0)
-	{
-		printf("[%s", bdata(sl->entry[0]));
-		for (i = 1; i < sl->qty; i++)
-		{
-			printf(", %s", bdata(sl->entry[i]));
-		}
-		printf("]\n");
-	}
-	else if (sl->qty == 0)
-	{
-		printf("[]\n");
-	}
+    int i = 0;
+    if (!sl) return; /* nothing is printed for a NULL list */
+    if (sl->qty > 0)
+    {
+        printf("[%s", bdata(sl->entry[0])); /* first entry without a leading separator */
+        for (i = 1; i < sl->qty; i++)
+        {
+            printf(", %s", bdata(sl->entry[i]));
+        }
+        printf("]\n");
+    }
+    else if (sl->qty == 0) /* a negative qty prints nothing at all */
+    {
+        printf("[]\n");
+    }
 }
 
 int bstrListDel(struct bstrList * sl, int idx)
 {
-	int i;
+    int i;
 
-	if (!sl || idx < 0 || idx >= sl->qty) return BSTR_ERR;
+    if (!sl || idx < 0 || idx >= sl->qty) return BSTR_ERR; /* reject NULL list and out-of-range index */
 
-	bdestroy(sl->entry[idx]);
+    bdestroy(sl->entry[idx]); /* free the removed entry */
 
-	for (i = idx+1; i < sl->qty; i++)
-	{
-		sl->entry[i-1] = bstrcpy(sl->entry[i]);
-	}
-	sl->qty--;
+    for (i = idx+1; i < sl->qty; i++)
+    {
+        sl->entry[i-1] = sl->entry[i]; /* move the pointer down; copying would leak the original entry */
+    }
+    sl->entry[--sl->qty] = NULL; /* shrink the list and clear the now unused last slot */
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 bstring bstrListGet(struct bstrList * sl, int idx)
 {
-	if (!sl || idx < 0 || idx >= sl->qty) return NULL;
-	return sl->entry[idx];
+    if (!sl || idx < 0 || idx >= sl->qty) return NULL; /* NULL list or index out of range */
+    return sl->entry[idx]; /* the stored entry itself, not a copy */
 }
 
 /*
@@ -121,3 +145,29 @@ int bisnumber(bstring b)
     }
     return count == blength(b);
 }
+
+bstring read_file(char *filename)
+{
+    /* Read the whole file into a new bstring; on error the data read so far (possibly empty) is returned. */
+    size_t ret = 0;
+    FILE* fp = NULL;
+    char buf[BUFSIZ];
+    bstring content = bfromcstr("");
+    fp = fopen(filename, "r");
+    if (fp == NULL) {
+        fprintf(stderr, "fopen(%s): errno=%d\n", filename, errno);
+        return content;
+    }
+    for (;;) {
+        /* Read another chunk; fread() returns 0 for both end-of-file and error */
+        ret = fread(buf, 1, sizeof(buf), fp);
+        if (ret == 0) {
+            if (ferror(fp))
+                fprintf(stderr, "fread(%p, 1, %zu, %p): errno=%d\n", (void*)buf, sizeof(buf), (void*)fp, errno);
+            break;
+        }
+        bcatblk(content, buf, (int)ret);
+    }
+    fclose(fp); /* the stream was never closed before */
+    return content;
+}
diff --git a/src/calculator.c b/src/calculator.c
new file mode 100644
index 000000000..5b35633dd
--- /dev/null
+++ b/src/calculator.c
@@ -0,0 +1,1133 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator.c
+ *
+ *      Description:  Infix calculator
+ *
+ *      Version:   4.2
+ *      Released:  22.12.2016
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) Brandon Mills
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * =======================================================================================
+ */
+/*
+ * =======================================================================================
+ *
+ *      Some changes done for the integration in LIKWID, see inline comments
+ *
+ *      Version:   4.2
+ *      Released:  22.12.2016
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig@gmail.com
+ *                Thomas Roehl (tr), thomas.roehl@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h> // Temporary
+#include <getopt.h>
+#include <calculator_stack.h>
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define bool char /* local boolean type for this file */
+#define true 1
+#define false 0
+#define PI 3.141592653589793 /* used for the degree <-> radian conversion */
+
+#ifndef NAN
+#define NAN (0.0/0.0) /* fallback if the math header does not provide NAN */
+#endif
+
+#ifndef INFINITY
+#define INFINITY (1.0/0.0) /* fallback if the math header does not provide INFINITY */
+#endif
+
+/* Added by Thomas Roehl (Thomas.Roehl@fau.de) to reduce reallocs by allocating a temporary
+ * token for parsing as well as for transforming a number to a string.
+ */
+#define MAXTOKENLENGTH 512 /* presumably the default for prefs.maxtokenlength - set outside this excerpt */
+#define MAXPRECISION 20 /* upper limit for the number of decimals printed by num2Str() */
+#define DEFAULTPRECISION 5 /* presumably the default for prefs.precision - set outside this excerpt */
+#define AUTOPRECISION -1 /* prefs.precision value that makes num2Str() strip trailing zeros */
+#define FUNCTIONSEPARATOR "|" /* stack marker at which doFunc() stops collecting function arguments */
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+typedef enum /* character and token classes, see type() and tokenType() */
+{
+    addop, /* '+' or '-' */
+    multop, /* '*', '/' or '%' */
+    expop, /* '^' */
+    lparen, /* '(' */
+    rparen, /* ')' */
+    digit, /* '0'..'9' */
+    value, /* token class of numbers and of the special values nan/inf */
+    decimal, /* '.' */
+    space, /* ' ' */
+    text, /* ASCII letter */
+    function, /* text token accepted by isFunction() */
+    identifier, /* any other text token */
+    argsep, /* ',' */
+    invalid /* every other character, or a NULL token */
+} Symbol;
+
+struct Preferences
+{
+    struct Display
+    {
+        bool tokens; /* not read in this excerpt; presumably enables printing of the token list */
+        bool postfix; /* when set, evalStackPush() prints every token it receives */
+    } display;
+    struct Mode
+    {
+        bool degrees; /* trigonometric functions in doFunc() work in degrees instead of radians */
+    } mode;
+    int precision; /* decimals printed by num2Str(); AUTOPRECISION strips trailing zeros */
+    int maxtokenlength; /* size of the buffers allocated in num2Str() and tokenize() */
+} prefs;
+
+typedef enum /* error conditions reported through raise() */
+{
+    divZero, /* "Divide by zero" */
+    overflow, /* "Overflow" */
+    parenMismatch, /* "Mismatched parentheses" */
+    inputMissing, /* "Function input missing" */
+} Error;
+
+typedef char* token; /* a token is a NUL-terminated C string */
+
+typedef double number; /* all calculations are done in double precision */
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+void raise(Error err) /* Map an error code to its message; NOTE(review): non-static and named like libc's raise() from <signal.h> - confirm this cannot clash */
+{
+    char* msg; /* only consumed by the disabled printf below, so the function currently has no visible effect */
+    switch(err)
+    {
+        case divZero:
+            msg = "Divide by zero";
+            break;
+        case overflow:
+            msg = "Overflow";
+            break;
+        case parenMismatch:
+            msg = "Mismatched parentheses";
+            break;
+        case inputMissing:
+            msg = "Function input missing";
+            break;
+    }
+    //printf("\tError: %s\n", msg);
+}
+
+inline unsigned int /* NOTE(review): plain 'inline' emits no external definition under C99 rules - confirm nothing outside this file calls toDigit() */
+toDigit(char ch)
+{
+    return ch - '0'; /* numeric value of a decimal digit character; the argument is not range-checked */
+}
+
+number buildNumber(token str) /* Convert the text of a token to a number */
+{
+    number result = 0;
+    /*while(*str && *str != '.')
+    {
+        result = result * 10 + toDigit(*str++);
+    }*/
+    result = strtod(str, NULL); /* no end pointer is passed, so unparsed trailing characters go unnoticed */
+    return result;
+}
+
+token num2Str(number num) /* Format num as a newly malloc'ed string (the caller frees it); returns NULL if the allocation fails */
+{
+    int len = 0;
+    int precision = MAXPRECISION;
+    if (prefs.precision >= 0 && prefs.precision < precision)
+        precision = prefs.precision;
+    token str = (token)malloc(prefs.maxtokenlength*sizeof(char));
+    if (!str) return NULL; /* out of memory: there is no buffer to format into */
+    len = snprintf(str, prefs.maxtokenlength-1, "%.*f", precision, num);
+    if (prefs.precision == AUTOPRECISION)
+    {
+        /* Strip trailing zeros by re-printing with one decimal less. Stop at precision 0 (printf treats a negative precision as "default", which looped forever for e.g. 0 or 10) and when the output was truncated (len is then outside the buffer). */
+        while (precision > 0 && len > 0 && len < prefs.maxtokenlength-1 && str[len-1] == '0')
+            len = snprintf(str, prefs.maxtokenlength-1, "%.*f", --precision, num);
+    }
+
+    return str;
+}
+
+number
+toRadians(number degrees) /* Convert an angle from degrees to radians */
+{
+    return degrees * PI / 180.0;
+}
+
+number
+toDegrees(number radians) /* Convert an angle from radians to degrees */
+{
+    return radians * 180.0 / PI;
+}
+
+int doFunc(Stack *s, token function) /* Apply 'function' to the argument(s) on top of s and push the result string; returns 0, or -1 (after pushing NaN) if no argument is available */
+{
+    if (stackSize(s) == 0)
+    {
+        raise(inputMissing);
+        stackPush(s, num2Str(NAN));
+        return -1;
+    }
+    else if (stackSize(s) == 1 && strcmp(stackTop(s), FUNCTIONSEPARATOR) == 0) /* only the separator is left: no argument */
+    {
+        stackPop(s);
+        raise(inputMissing);
+        stackPush(s, num2Str(NAN));
+        return -1;
+    }
+    token input = (token)stackPop(s);
+    number num = buildNumber(input);
+    number result = num; /* an unknown function name leaves the first argument unchanged */
+    number counter = 0;
+
+    if(strncmp(function, "abs", 3) == 0) /* names are matched by prefix only */
+        result = fabs(num);
+    else if(strncmp(function, "floor", 5) == 0)
+        result = floor(num);
+    else if(strncmp(function, "ceil", 4) == 0)
+        result = ceil(num);
+    else if(strncmp(function, "sin", 3) == 0)
+        result = !prefs.mode.degrees ? sin(num) : sin(toRadians(num));
+    else if(strncmp(function, "cos", 3) == 0)
+        result = !prefs.mode.degrees ? cos(num) : cos(toRadians(num));
+    else if(strncmp(function, "tan", 3) == 0)
+        result = !prefs.mode.degrees ? tan(num) : tan(toRadians(num));
+    else if(strncmp(function, "arcsin", 6) == 0
+         || strncmp(function, "asin", 4) == 0)
+        result = !prefs.mode.degrees ? asin(num) : toDegrees(asin(num));
+    else if(strncmp(function, "arccos", 6) == 0
+         || strncmp(function, "acos", 4) == 0)
+        result = !prefs.mode.degrees ? acos(num) : toDegrees(acos(num));
+    else if(strncmp(function, "arctan", 6) == 0
+         || strncmp(function, "atan", 4) == 0)
+        result = !prefs.mode.degrees ? atan(num) : toDegrees(atan(num));
+    else if(strncmp(function, "sqrt", 4) == 0)
+        result = sqrt(num);
+    else if(strncmp(function, "cbrt", 4) == 0)
+        result = cbrt(num);
+    else if(strncmp(function, "log", 3) == 0)
+        result = log(num);
+    else if(strncmp(function, "exp", 3) == 0)
+        result = exp(num);
+    else if(strncmp(function, "min", 3) == 0)
+    {
+        while (stackSize(s) > 0 && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0) /* consume all arguments up to the separator */
+        {
+            input = (token)stackPop(s);
+            num = buildNumber(input);
+            if (num < result)
+                result = num;
+        }
+    }
+    else if(strncmp(function, "max", 3) == 0)
+    {
+        while (stackSize(s) > 0 && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0)
+        {
+            input = (token)stackPop(s);
+            num = buildNumber(input);
+            if (num > result)
+                result = num;
+        }
+    }
+    else if(strncmp(function, "sum", 3) == 0)
+    {
+        while (stackSize(s) > 0  && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0)
+        {
+            input = (token)stackPop(s);
+            num = buildNumber(input);
+            result += num;
+        }
+    }
+    else if(strncmp(function, "avg", 3) == 0 ||
+            strncmp(function, "mean", 4) == 0)
+    {
+        // Result already initialized with first number
+        counter = 1;
+        while (stackSize(s) > 0  && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0)
+        {
+            input = (token)stackPop(s);
+            num = buildNumber(input);
+            result += num;
+            counter++;
+        }
+        result /= counter;
+    }
+    else if(strncmp(function, "median", 6) == 0)
+    {
+        // needed for sorting
+        Stack tmp, safe;
+        // Result already initialized with first number
+        counter = 1;
+        stackInit(&tmp, (stackSize(s) > 0 ? stackSize(s) : 1));
+        stackInit(&safe, (stackSize(s) > 0 ? stackSize(s) : 1));
+        // add first value to the later sorted stack
+        stackPush(&tmp, input);
+        while (stackSize(s) > 0  && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0)
+        {
+            input = (token)stackPop(s);
+            num = buildNumber(input);
+            // move all sorted values smaller than the new value aside
+            while (stackSize(&tmp) > 0 && buildNumber(stackTop(&tmp)) < num)
+            {
+                stackPush(&safe, stackPop(&tmp));
+            }
+            // push value on the sorted stack
+            stackPush(&tmp, input);
+            // push all saved numbers back on the sorted stack
+            while (stackSize(&safe) > 0)
+            {
+                stackPush(&tmp, stackPop(&safe));
+            }
+            counter++;
+        }
+        stackFree(&safe);
+        // calculate the median index
+        counter = (number)(((int)counter+1)/2); /* integer division: the lower median for an even count */
+        // pop all numbers until median index
+        while (counter > 1)
+        {
+            stackPop(&tmp);
+            counter--;
+        }
+        result = buildNumber(stackPop(&tmp));
+        // pop the remaining sorted stack
+        while (stackSize(&tmp) > 0)
+        {
+            stackPop(&tmp);
+        }
+        stackFree(&tmp);
+    }
+    else if(strncmp(function, "var", 3) == 0)
+    {
+        Stack tmp;
+        counter = 1;
+        // second stack to store values during calculation of mean
+        stackInit(&tmp, (stackSize(s) > 0 ? stackSize(s) : 1));
+        // push first value to temporary stack
+        stackPush(&tmp, input);
+        number mean = result;
+        while (stackSize(s) > 0  && strcmp(stackTop(s), FUNCTIONSEPARATOR) != 0)
+        {
+            input = (token)stackPop(s);
+            // push value to temporary stack
+            stackPush(&tmp, input);
+            num = buildNumber(input);
+            mean += num;
+            counter++;
+        }
+        // calculate mean
+        mean /= counter;
+        result = 0;
+        // calculate sum of squared differences
+        while (stackSize(&tmp) > 0)
+        {
+            input = (token)stackPop(&tmp);
+            num = buildNumber(input)-mean;
+            result += pow(num,2);
+        }
+        // determine variance
+        result /= counter; /* divides by the number of values, not by n-1 */
+        stackFree(&tmp);
+    }
+    if (stackSize(s) > 0 && strcmp(stackTop(s), FUNCTIONSEPARATOR) == 0) /* drop the separator; check for an empty stack first like the loops above */
+        stackPop(s);
+    stackPush(s, num2Str(result));
+    return 0;
+}
+
+int doOp(Stack *s, token op) /* Pop two operands from s, apply the binary operator *op and push the result string; returns -1 on division by zero, else 0 */
+{
+    int err = 0;
+    token roperand = (token)stackPop(s); /* the right operand was pushed last */
+    token loperand = (token)stackPop(s);
+    number lside = buildNumber(loperand);
+    number rside = buildNumber(roperand);
+    number ret = NAN; /* result for an operator character not handled below (was uninitialised) */
+    switch(*op)
+    {
+        case '^':
+            {
+                ret = pow(lside, rside);
+            }
+            break;
+        case '*':
+            {
+                ret = lside * rside;
+            }
+            break;
+        case '/':
+            {
+                if(rside == 0)
+                {
+                    raise(divZero);
+                    if (lside == 0)
+                        ret = NAN;
+                    else
+                        ret = INFINITY;
+                    err = -1;
+                }
+                else
+                    ret = lside / rside;
+            }
+            break;
+        case '%':
+            {
+                if(rside == 0)
+                {
+                    raise(divZero);
+                    if (lside == 0)
+                        ret = NAN;
+                    else
+                        ret = INFINITY;
+                    err = -1;
+                }
+                else
+                {
+                    /* truncated remainder like before, but without the (int) cast that overflows for large quotients */
+                    ret = fmod(lside, rside);
+                }
+            }
+            break;
+        case '+':
+            {
+                ret = lside + rside;
+            }
+            break;
+        case '-':
+            {
+                ret = lside - rside;
+            }
+            break;
+    }
+    stackPush(s, num2Str(ret));
+    return err;
+}
+
+Symbol type(char ch) /* Classify a single input character */
+{
+    Symbol result;
+    switch(ch)
+    {
+        case '+':
+        case '-':
+            result = addop;
+            break;
+        case '*':
+        case '/':
+        case '%':
+            result = multop;
+            break;
+        case '^':
+            result = expop;
+            break;
+        case '(':
+            result = lparen;
+            break;
+        case ')':
+            result = rparen;
+            break;
+        case '.':
+            result = decimal;
+            break;
+        case ' ':
+            result = space;
+            break;
+        case ',':
+            result = argsep;
+            break;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+            result = digit;
+            break;
+        case 'A': /* ASCII letters of both cases are text */
+        case 'B':
+        case 'C':
+        case 'D':
+        case 'E':
+        case 'F':
+        case 'G':
+        case 'H':
+        case 'I':
+        case 'J':
+        case 'K':
+        case 'L':
+        case 'M':
+        case 'N':
+        case 'O':
+        case 'P':
+        case 'Q':
+        case 'R':
+        case 'S':
+        case 'T':
+        case 'U':
+        case 'V':
+        case 'W':
+        case 'X':
+        case 'Y':
+        case 'Z':
+        case 'a':
+        case 'b':
+        case 'c':
+        case 'd':
+        case 'e':
+        case 'f':
+        case 'g':
+        case 'h':
+        case 'i':
+        case 'j':
+        case 'k':
+        case 'l':
+        case 'm':
+        case 'n':
+        case 'o':
+        case 'p':
+        case 'q':
+        case 'r':
+        case 's':
+        case 't':
+        case 'u':
+        case 'v':
+        case 'w':
+        case 'x':
+        case 'y':
+        case 'z':
+            result = text;
+            break;
+        default: /* anything else, including tab and underscore */
+            result = invalid;
+            break;
+    }
+    return result;
+}
+
+bool isFunction(token tk) /* True if tk starts with one of the supported function names */
+{
+    return (strncmp(tk, "abs", 3) == 0 /* prefix comparison: longer tokens beginning with a name match as well */
+        || strncmp(tk, "floor", 5) == 0
+        || strncmp(tk, "ceil", 4) == 0
+        || strncmp(tk, "sin", 3) == 0
+        || strncmp(tk, "cos", 3) == 0
+        || strncmp(tk, "tan", 3) == 0
+        || strncmp(tk, "arcsin", 6) == 0
+        || strncmp(tk, "arccos", 6) == 0
+        || strncmp(tk, "arctan", 6) == 0
+        || strncmp(tk, "asin", 4) == 0
+        || strncmp(tk, "acos", 4) == 0
+        || strncmp(tk, "atan", 4) == 0
+        || strncmp(tk, "sqrt", 4) == 0
+        || strncmp(tk, "cbrt", 4) == 0
+        || strncmp(tk, "log", 3) == 0
+        || strncmp(tk, "min", 3) == 0
+        || strncmp(tk, "max", 3) == 0
+        || strncmp(tk, "sum", 3) == 0
+        || strncmp(tk, "avg", 3) == 0
+        || strncmp(tk, "mean", 4) == 0
+        || strncmp(tk, "median", 6) == 0
+        || strncmp(tk, "var", 3) == 0
+        || strncmp(tk, "exp", 3) == 0);
+}
+
+bool isSpecialValue(token tk) /* True if tk starts with "nan" or "inf" (lower case, prefix comparison) */
+{
+    return (strncmp(tk, "nan", 3) == 0 || strncmp(tk, "inf", 3) == 0);
+}
+
+Symbol tokenType(token tk) /* Classify a whole token, starting from the class of its first character */
+{
+    if (!tk)
+        return invalid;
+    Symbol ret = type(*tk);
+    switch(ret)
+    {
+        case text:
+            if(isFunction(tk))
+                ret = function;
+            else if(isSpecialValue(tk))
+                ret = value;
+            else
+                ret = identifier;
+            break;
+        case addop:
+            if(*tk == '-' && strlen(tk) > 1) /* a '-' followed by more characters takes the class of the rest, e.g. a negative number */
+                ret = tokenType(tk+1);
+            break;
+        case decimal:
+        case digit:
+            ret = value;
+            break;
+        default:
+            break;
+    }
+    return ret;
+}
+
+int tokenize(char *str, char *(**tokensRef)) /* Split str into malloc'ed token strings stored in *tokensRef; returns the token count, 0 if an allocation fails */
+{
+    int i = 0;
+    char** tokens = NULL;
+    char** tmp = NULL;
+    char* ptr = str;
+    char ch = '\0';
+    int numTokens = 0;
+    char* tmpToken = malloc((prefs.maxtokenlength+1) * sizeof(char)); /* scratch buffer, valid indices 0..maxtokenlength */
+    if (!tmpToken)
+    {
+        fprintf(stderr, "Malloc of temporary buffer failed\n");
+        return 0;
+    }
+    while((ch = *ptr++))
+    {
+        if(type(ch) == invalid) // Stop tokenizing when we encounter an invalid character
+            break;
+
+        token newToken = NULL;
+        tmpToken[0] = '\0';
+        switch(type(ch))
+        {
+            case addop:
+                {
+                    // Check if this is a negative
+                    if(ch == '-'
+                        && (numTokens == 0
+                            || (tokenType(tokens[numTokens-1]) == addop
+                                || tokenType(tokens[numTokens-1]) == multop
+                                || tokenType(tokens[numTokens-1]) == expop
+                                || tokenType(tokens[numTokens-1]) == lparen
+                                || tokenType(tokens[numTokens-1]) == argsep)))
+                    {
+                        // Assemble an n-character (plus null-terminator) number token
+                        {
+                            int len = 1;
+                            bool hasDecimal = false;
+                            bool hasExponent = false;
+
+                            if(type(ch) == decimal) // Allow numbers to start with decimal
+                            {
+                                //printf("Decimal\n");
+                                hasDecimal = true;
+                                len++;
+                                tmpToken[0] = '0';
+                                tmpToken[1] = '.';
+                            }
+                            else // Numbers that do not start with decimal
+                            {
+                                tmpToken[len-1] = ch;
+                            }
+
+                            // Assemble rest of number
+                            for(; // Don't change len
+                                *ptr // There is a next character and it is not null
+                                && len < prefs.maxtokenlength // Leave room for the terminator written at tmpToken[len]
+                                && (type(*ptr) == digit // The next character is a digit
+                                     || ((type(*ptr) == decimal // Or the next character is a decimal
+                                         && hasDecimal == 0)) // But we have not added a decimal
+                                     || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+                                         && hasExponent == false) // But we have not added an exponent yet
+                                || ((*ptr == '+' || *ptr == '-') && hasExponent == true)); // Exponent with sign
+                                ++len)
+                            {
+                                if(type(*ptr) == decimal)
+                                    hasDecimal = true;
+                                else if(*ptr == 'E' || *ptr == 'e')
+                                    hasExponent = true;
+                                tmpToken[len] = *ptr++;
+                            }
+
+                            // Append null-terminator
+                            tmpToken[len] = '\0';
+                        }
+                        break;
+                    }
+                    // If it's not part of a number, it's an op - fall through
+                }
+            case multop:
+            case expop:
+            case lparen:
+            case rparen:
+            case argsep:
+                // Assemble a single-character (plus null-terminator) operation token
+                {
+                    tmpToken[0] = ch;
+                    tmpToken[1] = '\0';
+                }
+                break;
+            case digit:
+            case decimal:
+                // Assemble an n-character (plus null-terminator) number token
+                {
+                    int len = 1;
+                    bool hasDecimal = false;
+                    bool hasExponent = false;
+
+                    if(type(ch) == decimal) // Allow numbers to start with decimal
+                    {
+                        //printf("Decimal\n");
+                        hasDecimal = true;
+                        len++;
+                        tmpToken[0] = '0';
+                        tmpToken[1] = '.';
+                    }
+                    else // Numbers that do not start with decimal
+                    {
+                        tmpToken[len-1] = ch;
+                    }
+
+                    // Assemble rest of number
+                    for(; // Don't change len
+                        *ptr // There is a next character and it is not null
+                        && len < prefs.maxtokenlength // Leave room for the terminator written at tmpToken[len]
+                        && (type(*ptr) == digit // The next character is a digit
+                             || ((type(*ptr) == decimal // Or the next character is a decimal
+                                 && hasDecimal == 0)) // But we have not added a decimal
+                             || ((*ptr == 'E' || *ptr == 'e') // Or the next character is an exponent
+                                 && hasExponent == false) // But we have not added an exponent yet
+                             || ((*ptr == '+' || *ptr == '-') && hasExponent == true)); // Exponent with sign
+                        ++len)
+                    {
+                        if(type(*ptr) == decimal)
+                            hasDecimal = true;
+                        else if(*ptr == 'E' || *ptr == 'e')
+                            hasExponent = true;
+                        tmpToken[len] = *ptr++;
+                    }
+
+                    // Append null-terminator
+                    tmpToken[len] = '\0';
+                }
+                break;
+            case text:
+                // Assemble an n-character (plus null-terminator) text token
+                {
+                    int len = 1;
+                    tmpToken[0] = ch;
+                    for(len = 1; *ptr && type(*ptr) == text && len < prefs.maxtokenlength; ++len) // '<' keeps tmpToken[len] below inside the buffer
+                    {
+                        tmpToken[len] = *ptr++;
+                    }
+                    tmpToken[len] = '\0';
+                }
+                break;
+            default:
+                break;
+        }
+        // Add to list of tokens
+        if(tmpToken[0] != '\0' && strlen(tmpToken) > 0)
+        {
+            numTokens++;
+            /*if(tokens == NULL) // First allocation
+                tokens = (char**)malloc(numTokens * sizeof(char*));
+            else*/
+
+            newToken = malloc((strlen(tmpToken)+1) * sizeof(char));
+            if (!newToken)
+            {
+                numTokens--;
+                break;
+            }
+            strcpy(newToken, tmpToken);
+            newToken[strlen(tmpToken)] = '\0';
+            tmp = (char**)realloc(tokens, numTokens * sizeof(char*));
+            if (tmp == NULL)
+            {
+                if (tokens != NULL)
+                {
+                    for(i=0;i<numTokens-1;i++)
+                    {
+                        if (tokens[i] != NULL)
+                        {
+                            free(tokens[i]);
+                            tokens[i] = NULL;
+                        }
+                    }
+                    free(tokens);
+                    tokens = NULL;
+                }
+                *tokensRef = NULL;
+                free(newToken);
+                free(tmpToken);
+                return 0;
+            }
+            tokens = tmp;
+            tmp = NULL;
+            tokens[numTokens - 1] = newToken;
+        }
+    }
+    *tokensRef = tokens; // Send back out
+    free(tmpToken);
+    tmpToken = NULL;
+    return numTokens;
+}
+
+bool leftAssoc(token op) /* True for left-associative operator tokens */
+{
+    bool ret = false;
+    switch(tokenType(op))
+    {
+        case addop:
+        case multop:
+
+            ret = true;
+            break;
+        case function: /* functions and '^' are not left-associative */
+        case expop:
+            ret = false;
+            break;
+        default: /* every other token class keeps the initial false */
+            break;
+    }
+    return ret;
+}
+
+int precedence(token op1, token op2) /* Compare two operator tokens: 1 if op1 binds tighter, -1 if op2 does, 0 if equal or not covered below */
+{
+    int ret = 0;
+
+    if (op2 == NULL) /* nothing to compare against: op1 wins */
+        ret = 1;
+    else if(tokenType(op1) == tokenType(op2)) // Equal precedence
+        ret = 0;
+    else if(tokenType(op1) == addop
+            && (tokenType(op2) == multop || tokenType(op2) == expop)) // op1 has lower precedence
+        ret = -1;
+    else if(tokenType(op2) == addop
+            && (tokenType(op1) == multop || tokenType(op1) == expop)) // op1 has higher precedence
+        ret = 1;
+    else if(tokenType(op1) == multop
+            && tokenType(op2) == expop) // op1 has lower precedence
+        ret = -1;
+    else if(tokenType(op1) == expop
+            && tokenType(op2) == multop) // op1 has higher precedence
+        ret = 1;
+    else if (tokenType(op1) == function
+            && (tokenType(op2) == addop || tokenType(op2) == multop || tokenType(op2) == expop || tokenType(op2) == lparen)) /* a function binds tighter than any operator or '(' */
+        ret = 1;
+    else if ((tokenType(op1) == addop || tokenType(op1) == multop || tokenType(op1) == expop)
+            && tokenType(op2) == function)
+        ret = -1;
+    return ret;
+}
+
+void evalStackPush(Stack *s, token val)
+{
+    /*
+     * Feed one token of the postfix stream into the evaluation stack s.
+     *
+     *   value                - pushed unchanged
+     *   function             - handed to doFunc()
+     *   addop/multop/expop   - handed to doOp() once s holds at least two
+     *                          entries, otherwise pushed like a value
+     *   anything else        - ignored
+     *
+     * Negative return codes of doFunc()/doOp() are not propagated; the
+     * function simply returns either way.
+     */
+
+    if(prefs.display.postfix)
+        printf("\t%s\n", val); // optional trace of the postfix stream
+
+    const int kind = tokenType(val);
+
+    if (kind == function)
+    {
+        (void)doFunc(s, val);
+    }
+    else if (kind == value)
+    {
+        stackPush(s, val);
+    }
+    else if (kind == expop || kind == multop || kind == addop)
+    {
+        // A binary operator needs two operands on the stack
+        if (stackSize(s) < 2)
+            stackPush(s, val);
+        else
+            (void)doOp(s, val);
+    }
+    // every other token type is dropped silently
+}
+/* Shunting-yard pass over tokens[0..numTokens-1]: values and popped operators are fed to evalStackPush(), which applies operators/functions right away. Returns true if a parenthesis mismatch was raised. */
+bool postfix(token *tokens, int numTokens, Stack *output)
+{
+    Stack operators, intermediate; // operators: pending operators/functions/parens; intermediate: log of output's top after each reduction
+    int i;
+    bool err = false; // becomes true once parenMismatch has been raised
+    stackInit(&operators, numTokens);
+    stackInit(&intermediate, numTokens);
+    for(i = 0; i < numTokens; i++)
+    {
+        // From Wikipedia/Shunting-yard_algorithm:
+        switch(tokenType(tokens[i]))
+        {
+            case value:
+                {
+                    // If the token is a number, then add it to the output queue.
+                    //printf("Adding number %s to output stack\n", tokens[i]);
+                    evalStackPush(output, tokens[i]);
+                }
+                break;
+            case function:
+                {
+                    while(stackSize(&operators) > 0
+                        && (tokenType(tokens[i]) != lparen)
+                        && ((precedence(tokens[i], (char*)stackTop(&operators)) <= 0)))
+                    {
+                        //printf("Moving operator %s from operator stack to output stack\n", (char*)stackTop(&operators));
+                        evalStackPush(output, stackPop(&operators));
+                        stackPush(&intermediate, stackTop(output)); // remember what the reduction left on top of output
+                    }
+
+                    // If the token is a function token, then push it onto the stack.
+                    //printf("Adding operator %s to operator stack\n", tokens[i]);
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case argsep:
+                {
+                    /*
+                     * If the token is a function argument separator (e.g., a comma):
+                     *     Until the token at the top of the stack is a left
+                     *     paren, pop operators off the stack onto the output
+                     *     queue. If no left paren encountered, either separator
+                     *     was misplaced or parens mismatched.
+                     */
+                    while(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen
+                        && stackSize(&operators) > 1)
+                    {
+                        //printf("Moving operator from operator stack to output stack\n");
+                        evalStackPush(output, stackPop(&operators));
+                        stackPush(&intermediate, stackTop(output));
+                    }
+                }
+                break;
+            case addop:
+            case multop:
+            case expop:
+                {
+                    /*
+                     * If the token is an operator, op1, then:
+                     *     while there is an operator token, op2, at the top of the stack, and
+                     *             either op1 is left-associative and its precedence is less than or equal to that of op2,
+                     *             or op1 is right-associative and its precedence is less than that of op2,
+                     *         pop op2 off the stack, onto the output queue
+                     *     push op1 onto the stack
+                     */
+                    while(stackSize(&operators) > 0
+                        && (tokenType((char*)stackTop(&operators)) == addop || tokenType((char*)stackTop(&operators)) == multop || tokenType((char*)stackTop(&operators)) == expop)
+                        && ((leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) <= 0)
+                            || (!leftAssoc(tokens[i]) && precedence(tokens[i], (char*)stackTop(&operators)) < 0)))
+                    {
+                        //printf("Moving operator %s from operator stack to output stack\n", (char*)stackTop(&operators));
+                        evalStackPush(output, stackPop(&operators));
+                        stackPush(&intermediate, stackTop(output));
+                    }
+                    //printf("Adding operator %s to operator stack\n", tokens[i]);
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case lparen:
+                {
+                    // If the token is a left paren, then push it onto the stack
+                    // NOTE(review): stackTop() returns NULL on an empty stack, so tokenType() gets NULL when '(' comes first - confirm it copes.
+                    if (tokenType(stackTop(&operators)) == function)
+                        stackPush(output, FUNCTIONSEPARATOR); // marker placed below the call's arguments (presumably consumed by doFunc() - confirm)
+                    stackPush(&operators, tokens[i]);
+                }
+                break;
+            case rparen:
+                {
+                    /*
+                     * If the token is a right paren:
+                     *     Until the token at the top of the stack is a left paren, pop operators off the stack onto the output queue
+                     *     Pop the left paren from the stack, but not onto the output queue
+                     *     If the stack runs out without finding a left paren, then there are mismatched parens
+                     */
+                    while(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen
+                        && stackSize(&operators) > 1)
+                    {
+                        //printf("Moving operator %s from operator stack to output stack\n", (char*)stackTop(&operators));
+                        evalStackPush(output, stackPop(&operators));
+                        stackPush(&intermediate, stackTop(output));
+                    }
+                    if(stackSize(&operators) > 0
+                        && tokenType((token)stackTop(&operators)) != lparen)
+                    {
+                        err = true;
+                        raise(parenMismatch);
+                    }
+                    // NOTE(review): a ')' met with an empty operator stack is not flagged above; the pop below then just returns NULL.
+                    stackPop(&operators); // Discard lparen
+                    while (stackSize(&operators) > 0 && tokenType((token)stackTop(&operators)) == function)
+                    {
+                        //printf("Removing function from operator stack to output stack\n");
+                        evalStackPush(output, stackPop(&operators));
+                        stackPush(&intermediate, stackTop(output));
+                    }
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    /*
+     * When there are no more tokens to read:
+     *     While there are still operator tokens on the stack:
+     *         If the operator token on the top of the stack is a paren, then there are mismatched parens
+     *         Pop the operator onto the output queue
+     */
+    while(stackSize(&operators) > 0)
+    {
+        if(tokenType((token)stackTop(&operators)) == lparen)
+        {
+            raise(parenMismatch);
+            err = true;
+        }
+        //printf("Moving operator from operator stack to output stack\n");
+        evalStackPush(output, stackPop(&operators));
+        stackPush(&intermediate, stackTop(output));
+    }
+    // pop result from intermediate stack
+    token r = stackPop(&intermediate); // taken off so the loop below does not free it; r is not used again here
+    // free remaining intermediate results. NOTE(review): an entry can alias an input token when evalStackPush() pushed an operator back unevaluated - possible double free with the caller's token cleanup, confirm.
+    while (stackSize(&intermediate) > 0)
+    {
+        token s = stackPop(&intermediate);
+        free(s);
+    }
+    if (err == true)
+    {
+        while (stackSize(&operators) > 0)
+        {
+            token s = stackPop(&operators);
+            // NOTE(review): the drain loop above already emptied operators, so this body appears unreachable.
+            free(s);
+        }
+    }
+    stackFree(&intermediate);
+    stackFree(&operators);
+    return err;
+}
+/* Added by Thomas Roehl (Thomas.Roehl@fau.de) as interface for LIKWID */
+int
+calculate_infix(char* finfix, double *result)
+{
+    /*
+     * Evaluate the infix expression finfix and store its value in *result.
+     * Returns 0 on success, -1 on failure (*result is NAN then, if non-NULL).
+     * A lone non-numeric token keeps its old outcome: NAN with return 0.
+     */
+    int i;
+    int ret = 0;
+    token r = NULL;
+    token* tokens = NULL;
+    Stack expr;
+    if (result == NULL)
+        return -1; // nowhere to store the outcome
+    *result = 0;
+    if (finfix == NULL)
+    {
+        *result = NAN; // no expression to evaluate
+        return -1;
+    }
+    prefs.maxtokenlength = MAXTOKENLENGTH;
+    prefs.precision = MAXPRECISION;
+    int numTokens = tokenize(finfix, &tokens);
+    if (numTokens == 1)
+    {
+        if (tokenType(tokens[0]) == value)
+            *result = strtod((char*)tokens[0], NULL);
+        else
+            *result = NAN;
+        goto freeTokens;
+    }
+    stackInit(&expr, numTokens);
+    ret = postfix(tokens, numTokens, &expr);
+    if ((stackSize(&expr) != 1) || (ret == true))
+    {
+        *result = NAN;
+        ret = -1;
+        goto calcerror;
+    }
+    // The result may be one of the input tokens: detach it from tokens[]
+    // so that it is freed exactly once.
+    r = stackPop(&expr);
+    for (i = 0; i < numTokens; i++)
+    {
+        if (tokens[i] == r)
+            tokens[i] = NULL;
+    }
+    *result = strtod((char*)r, NULL);
+    free(r);
+    ret = 0;
+calcerror:
+    stackFree(&expr);
+freeTokens:
+    if (tokens != NULL)
+    {
+        // free(NULL) is a no-op, so detached slots need no special case
+        for (i = 0; i < numTokens; i++)
+            free(tokens[i]);
+        free(tokens);
+    }
+    return ret;
+}
diff --git a/src/calculator_stack.c b/src/calculator_stack.c
new file mode 100644
index 000000000..f8f35ca70
--- /dev/null
+++ b/src/calculator_stack.c
@@ -0,0 +1,81 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator_stack.c
+ *
+ *      Description:  Stack implementation for infix calculator
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) Brandon Mills
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * =======================================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <calculator_stack.h>
+
+void stackInit(Stack *s, int size)
+{
+    s->top = -1;     /* empty stack: no valid index yet */
+    s->size = size;  /* number of entries requested */
+    s->content = malloc(sizeof(void*) * size); /* slot array; the allocation result is not checked here */
+}
+
+void stackPush(Stack *s, void* val)
+{
+    if (s->content != NULL && s->top + 1 < s->size) /* otherwise unallocated or full: drop val instead of writing out of bounds */
+        s->content[++(s->top)] = val;
+}
+
+void* stackTop(Stack *s)
+{
+    /* Peek at the top entry without removing it; NULL if the stack is empty or unallocated. */
+    if (s->top < 0 || s->content == NULL)
+        return NULL;
+    return s->content[s->top];
+}
+
+void* stackPop(Stack *s)
+{
+    /* Remove and return the top entry; NULL (stack left untouched) if empty or unallocated. */
+    if (s->top < 0 || s->content == NULL)
+        return NULL;
+    return s->content[s->top--];
+}
+
+int stackSize(Stack *s)
+{
+    return 1 + s->top; /* top is the index of the last entry, -1 when empty */
+}
+
+void stackFree(Stack *s)
+{
+    /* Release the slot array (the stored entries themselves are not freed) and reset to empty. */
+    free(s->content); /* free(NULL) is a no-op */
+    s->top = -1;
+    s->size = 0;
+    s->content = NULL;
+}
+
diff --git a/src/frequency.c b/src/frequency.c
deleted file mode 100644
index 90e8bf0ce..000000000
--- a/src/frequency.c
+++ /dev/null
@@ -1,1151 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  frequency.c
- *
- *      Description:  Module implementing an interface for frequency manipulation
- *
- *      Version:   <VERSION>
- *      Released:  <DATE>
- *
- *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
- *                Jan Treibig (jt), jan.treibig@gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <math.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include <bstrlib.h>
-#include <likwid.h>
-#include <types.h>
-#include <error.h>
-#include <topology.h>
-#include <access.h>
-#include <registers.h>
-#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
-#include <cpuid.h>
-#endif
-#include <lock.h>
-#include <frequency.h>
-#include <frequency_acpi.h>
-#include <frequency_pstate.h>
-
-char* daemon_path = TOSTRING(INSTALL_PREFIX) "/sbin/likwid-setFreq";
-
-
-typedef enum  {
-    NOT_DETECTED = 0,
-    ACPICPUFREQ,
-    INTELPSTATE,
-    PPCCPUFREQ,
-} likwid_freq_driver;
-
-likwid_freq_driver drv = NOT_DETECTED;
-
-static int freq_getDriver(const int cpu_id )
-{
-    FILE *f = NULL;
-    char buff[256];
-    char* rptr = NULL;
-    bstring bbuff;
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL)
-    {
-        //fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return -errno;
-    }
-    rptr = fgets(buff, 256, f);
-    if (rptr != NULL)
-    {
-        bbuff = bfromcstr(buff);
-        btrimws(bbuff);
-        if (strncmp(bdata(bbuff), "acpi-cpufreq", blength(bbuff)) == 0)
-        {
-            drv = ACPICPUFREQ;
-        }
-        else if (strncmp(bdata(bbuff), "intel_pstate", blength(bbuff)) == 0)
-        {
-            drv = INTELPSTATE;
-        }
-        bdestroy(bbuff);
-    }
-    fclose(f);
-    if (access(daemon_path, X_OK) != 0)
-    {
-        fprintf(stderr, "WARN: SetFreq daemon not found, cannot change settings\n");
-        drv = NOT_DETECTED;
-    }
-    return 0;
-}
-
-static int _freq_getUncoreMinMax(const int socket_id, int *cpuId, double* min, double* max)
-{
-    int cpu = -1;
-    *cpuId = -1;
-    *min = 0;
-    *max = 0;
-    for (int i=0; i<cpuid_topology.numHWThreads; i++)
-    {
-        if (cpuid_topology.threadPool[i].packageId == socket_id)
-        {
-            cpu = cpuid_topology.threadPool[i].apicId;
-            break;
-        }
-    }
-    if (cpu < 0)
-    {
-        fprintf(stderr, "Unknown socket ID %d\n", socket_id);
-        return -ENODEV;
-    }
-
-    char* avail = freq_getAvailFreq(cpu);
-    if (!avail)
-    {
-        fprintf(stderr, "Failed to get available CPU frequencies\n");
-        return -EINVAL;
-    }
-
-    double dmin = 0.0;
-    double dmax = 0.0;
-    bstring bavail = bfromcstr(avail);
-    free(avail);
-    struct bstrList* bavail_list;
-    bavail_list = bsplit(bavail, ' ');
-    bdestroy(bavail);
-    if (bavail_list->qty < 2)
-    {
-        fprintf(stderr, "Failed to read minimal and maximal frequencies\n");
-        bstrListDestroy(bavail_list);
-        return -EINVAL;
-    }
-    if (blength(bavail_list->entry[0]) > 0)
-    {
-        char* tptr = NULL;
-        dmin = strtod(bdata(bavail_list->entry[0]), &tptr);
-        if (bdata(bavail_list->entry[0]) != tptr)
-        {
-            dmin *= 1000;
-        }
-        else
-        {
-            fprintf(stderr, "Problem converting %s to double for comparison with given freq.\n", bdata(bavail_list->entry[0]));
-            return -EINVAL;
-        }
-    }
-    if (blength(bavail_list->entry[bavail_list->qty-1]) > 0)
-    {
-        char* tptr = NULL;
-        dmax = strtod(bdata(bavail_list->entry[bavail_list->qty-1]), &tptr);
-        if (bdata(bavail_list->entry[bavail_list->qty-1]) != tptr)
-        {
-            dmax *= 1000;
-        }
-        else
-        {
-            fprintf(stderr, "Problem converting %s to double for comparison with given freq.\n", bdata(bavail_list->entry[bavail_list->qty-1]));
-            return -EINVAL;
-        }
-    }
-    bstrListDestroy(bavail_list);
-
-    *cpuId = cpu;
-    if (dmin < dmax)
-    {
-        *min = dmin;
-        *max = dmax;
-    }
-    else
-    {
-        *max = dmin;
-        *min = dmax;
-    }
-
-    power_init(cpu);
-    if (power_info.turbo.numSteps > 0)
-    {
-        if (power_info.turbo.steps[0] > *max)
-        {
-            *max = power_info.turbo.steps[0];
-        }
-    }
-
-    return 0;
-}
-
-uint64_t freq_getCpuClockMax(const int cpu_id )
-{
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-    }
-    if (drv == ACPICPUFREQ)
-    {
-        return freq_acpi_getCpuClockMax(cpu_id);
-    }
-    else if (drv == INTELPSTATE)
-    {
-        return freq_pstate_getCpuClockMax(cpu_id);
-    }
-    return 0;
-}
-
-uint64_t freq_getCpuClockMin(const int cpu_id )
-{
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-    }
-    if (drv == ACPICPUFREQ)
-    {
-        return freq_acpi_getCpuClockMin(cpu_id);
-    }
-    else if (drv == INTELPSTATE)
-    {
-        return freq_pstate_getCpuClockMin(cpu_id);
-    }
-    return 0;
-}
-
-
-uint64_t freq_getCpuClockCurrent(const int cpu_id )
-{
-    FILE *f = NULL;
-    char cmd[256];
-    char buff[256];
-    char* eptr = NULL;
-    uint64_t clock = 0x0ULL;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return 0;
-        }
-    }
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL) {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    eptr = fgets(cmd, 256, f);
-    if (eptr != NULL)
-    {
-        clock = strtoull(cmd, NULL, 10);
-    }
-    fclose(f);
-    return clock * 1E3;
-}
-
-
-uint64_t freq_setCpuClockMax(const int cpu_id, const uint64_t freq)
-{
-    FILE *fpipe = NULL;
-    char cmd[256];
-    char buff[256];
-    uint64_t cur = 0x0ULL;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return 0;
-        }
-    }
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    cur = freq_getCpuClockMax(cpu_id);
-    if (cur == freq)
-    {
-        return cur;
-    }
-
-    sprintf(buff, "%s", daemon_path);
-    if (access(buff, X_OK))
-    {
-        fprintf(stderr, "Daemon %s not executable", buff);
-        return 0;
-    }
-
-    if (drv == ACPICPUFREQ)
-    {
-        sprintf(cmd, "%s %d max %lu", daemon_path, cpu_id, freq);
-    }
-    else if (drv == INTELPSTATE)
-    {
-        double f = (double)freq;
-        sprintf(cmd, "%s %d max %g", daemon_path, cpu_id, f/1000000);
-    }
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        fprintf(stderr, "Problems setting cpu frequency of CPU %d", cpu_id);
-        return 0;
-    }
-    if (pclose(fpipe))
-        return 0;
-
-    return freq;
-}
-
-/*uint64_t freq_getCpuClockMin(const int cpu_id )*/
-/*{*/
-
-/*    uint64_t clock = 0x0ULL;*/
-/*    FILE *f = NULL;*/
-/*    char cmd[256];*/
-/*    char buff[256];*/
-/*    char* eptr = NULL;*/
-
-/*    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_min_freq", cpu_id);*/
-/*    f = fopen(buff, "r");*/
-/*    if (f == NULL) {*/
-/*        fprintf(stderr, "Unable to open path %s for reading\n", buff);*/
-/*        return 0;*/
-/*    }*/
-/*    eptr = fgets(cmd, 256, f);*/
-/*    if (eptr != NULL)*/
-/*    {*/
-/*        clock = strtoull(cmd, NULL, 10);*/
-/*    }*/
-/*    fclose(f);*/
-/*    return clock *1E3;*/
-/*}*/
-
-uint64_t freq_setCpuClockMin(const int cpu_id, const uint64_t freq)
-{
-    FILE *fpipe = NULL;
-    char cmd[256];
-    char buff[256];
-    uint64_t cur = 0x0ULL;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return 0;
-        }
-    }
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    cur = freq_getCpuClockMin(cpu_id);
-    if (cur == freq)
-    {
-        return cur;
-    }
-
-    sprintf(buff, "%s", daemon_path);
-    if (access(buff, X_OK))
-    {
-        fprintf(stderr, "Daemon %s not executable", buff);
-        return 0;
-    }
-
-    //sprintf(cmd, "%s %d min %lu", daemon_path, cpu_id, freq);
-    if (drv == ACPICPUFREQ)
-    {
-        sprintf(cmd, "%s %d min %lu", daemon_path, cpu_id, freq);
-    }
-    else if (drv == INTELPSTATE)
-    {
-        double f = (double)freq;
-        sprintf(cmd, "%s %d min %g", daemon_path, cpu_id, f/1000000);
-    }
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        fprintf(stderr, "Problems setting cpu frequency of CPU %d", cpu_id);
-        return 0;
-    }
-    if (pclose(fpipe))
-        return 0;
-
-    return freq;
-}
-
-char * freq_getGovernor(const int cpu_id )
-{
-    FILE *f = NULL;
-    char cmd[256];
-    char buff[256];
-    char* eptr = NULL, *sptr = NULL;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return NULL;
-        }
-    }
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL) {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return NULL;
-    }
-    eptr = fgets(cmd, 256, f);
-    if (eptr != NULL)
-    {
-        bstring bbuff = bfromcstr(cmd);
-        btrimws(bbuff);
-        eptr = NULL;
-        eptr = malloc((blength(bbuff)+1) * sizeof(char));
-        if (eptr == NULL)
-        {
-            return NULL;
-        }
-        sptr = bdata(bbuff);
-        strcpy(eptr, sptr);
-        return eptr;
-    }
-    return NULL;
-}
-
-/*int freq_setTurbo(const int cpu_id, int turbo)
-{
-    FILE *fpipe = NULL;
-    char cmd[256];
-
-    sprintf(cmd, "%s %d tur %d", daemon_path, cpu_id, turbo);
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        fprintf(stderr, "Problems setting turbo mode of CPU %d", cpu_id);
-        return 0;
-    }
-    if (pclose(fpipe))
-        return 0;
-    return 1;
-}*/
-
-static int getAMDTurbo(const int cpu_id)
-{
-    int err = 0;
-    int own_hpm = 0;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-
-        err = HPMaddThread(cpu_id);
-        if (err != 0)
-        {
-            ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-            return err;
-        }
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpu_id, MSR_DEV, 0xC0010015, &tmp);
-    if (err)
-    {
-        ERROR_PLAIN_PRINT(Cannot read register 0xC0010015);
-        return err;
-    }
-    if (own_hpm)
-        HPMfinalize();
-    err = ((tmp >> 25) & 0x1);
-    return err == 0;
-}
-
-static int setAMDTurbo(const int cpu_id, const int turbo)
-{
-    int err = 0;
-    int own_hpm = 0;
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return -EPERM;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-
-        err = HPMaddThread(cpu_id);
-        if (err != 0)
-        {
-            ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-            return err;
-        }
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpu_id, MSR_DEV, 0xC0010015, &tmp);
-    if (err)
-    {
-        ERROR_PLAIN_PRINT(Cannot read register 0xC0010015);
-        return err;
-    }
-
-    if (turbo)
-    {
-        tmp &= ~(1ULL<<25);
-    }
-    else
-    {
-        tmp |= (1ULL << 25);
-    }
-    err = HPMwrite(cpu_id, MSR_DEV, 0xC0010015, tmp);
-    if (err)
-    {
-        ERROR_PLAIN_PRINT(Cannot write register 0xC0010015);
-        return err;
-    }
-
-    if (own_hpm)
-        HPMfinalize();
-    return err == 0;
-}
-
-static int getIntelTurbo(const int cpu_id)
-{
-    int err = 0;
-    int own_hpm = 0;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-        err = HPMaddThread(cpu_id);
-        if (err != 0)
-        {
-            ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-            return err;
-        }
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, &tmp);
-    if (err)
-    {
-        ERROR_PRINT(Cannot read register 0x%x, MSR_IA32_MISC_ENABLE);
-        return err;
-    }
-    if (own_hpm)
-        HPMfinalize();
-    err = ((tmp >> 38) & 0x1);
-    return err == 0;
-}
-
-static int setIntelTurbo(const int cpu_id, const int turbo)
-{
-    int err = 0;
-    int own_hpm = 0;
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return -EPERM;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-
-        err = HPMaddThread(cpu_id);
-        if (err != 0)
-        {
-            ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-            return err;
-        }
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, &tmp);
-    if (err)
-    {
-        ERROR_PRINT(Cannot read register 0x%x, MSR_IA32_MISC_ENABLE);
-        return err;
-    }
-    if (turbo)
-    {
-        tmp &= ~(1ULL << 38);
-    }
-    else
-    {
-        tmp |= (1ULL << 38);
-    }
-    err = HPMwrite(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, tmp);
-    if (err)
-    {
-        ERROR_PRINT(Cannot write register 0x%x, MSR_IA32_MISC_ENABLE);
-        return err;
-    }
-
-    if (own_hpm)
-        HPMfinalize();
-    return err == 0;
-}
-#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
-static int isAMD()
-{
-    unsigned int eax,ebx,ecx,edx;
-    eax = 0x0;
-    CPUID(eax,ebx,ecx,edx);
-    if (ecx == 0x444d4163)
-        return 1;
-    return 0;
-}
-#else
-static int isAMD()
-{
-    return 0;
-}
-#endif
-
-int freq_getTurbo(const int cpu_id)
-{
-    if (drv == ACPICPUFREQ)
-    {
-        if (isAMD())
-            return getAMDTurbo(cpu_id);
-        return getIntelTurbo(cpu_id);
-    }
-    else if (drv == INTELPSTATE)
-    {
-        return freq_pstate_getTurbo(cpu_id);
-    }
-    return -1;
-}
-
-int freq_setTurbo(const int cpu_id, const int turbo)
-{
-    FILE *fpipe = NULL;
-    char cmd[256];
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return 0;
-        }
-    }
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    sprintf(cmd, "%s %d tur %d", daemon_path, cpu_id, turbo);
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        fprintf(stderr, "Problems setting turbo mode of CPU %d", cpu_id);
-        return 0;
-    }
-    pclose(fpipe);
-    if (isAMD())
-        return setAMDTurbo(cpu_id, turbo);
-    else
-        return setIntelTurbo(cpu_id, turbo);
-    return 1;
-}
-
-int freq_setGovernor(const int cpu_id, const char* gov)
-{
-    FILE *fpipe = NULL;
-    char cmd[256];
-    char buff[256];
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return 0;
-        }
-    }
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    sprintf(buff, "%s", daemon_path);
-    if (access(buff, X_OK))
-    {
-        ERROR_PRINT(Daemon %s not executable, buff);
-        return 0;
-    }
-
-    sprintf(cmd, "%s %d gov %s", daemon_path, cpu_id, gov);
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        ERROR_PRINT(Problems setting cpu frequency of CPU %d, cpu_id);
-        return 0;
-    }
-    if (pclose(fpipe))
-        return 0;
-    return 1;
-}
-
-char * freq_getAvailFreq(const int cpu_id )
-{
-    int i, j, k;
-    FILE *fpipe = NULL;
-    char cmd[256];
-    char buff[2048];
-    char tmp[10];
-    char *eptr = NULL, *rptr = NULL, *sptr = NULL;
-    double d = 0;
-    int take_next = 0;
-    bstring bbuff;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return NULL;
-        }
-    }
-
-    sprintf(cmd, "%s 2>&1", daemon_path);
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        ERROR_PRINT(Problem executing %s, daemon_path);
-        return NULL;
-    }
-    while (fgets(buff, 2048, fpipe))
-    {
-        if (strncmp(buff, "Frequency steps:", 16) == 0)
-        {
-            //printf("Take next\n");
-            take_next = 1;
-            continue;
-        }
-        if (take_next)
-        {
-            int eidx = 0;
-            //printf("Take %s\n", buff);
-            eptr = malloc(strlen(buff) * sizeof(char));
-            sptr = strtok(buff, " ");
-            while (sptr != NULL)
-            {
-                d = atof(sptr);
-                if (d > 0)
-                {
-                    eidx += snprintf(&(eptr[eidx]), 19, "%g ", d*1E-6);
-                }
-                sptr = strtok(NULL, " ");
-            }
-            break;
-        }
-    }
-    if (pclose(fpipe) == -1)
-    {
-        return NULL;
-    }
-    for (int i=strlen(eptr)-1; i>= 0; i--)
-    {
-        if (eptr[i] == ' ')
-        {
-            eptr[i] = '\0';
-        }
-        else
-        {
-            break;
-        }
-    }
-    return eptr;
-}
-
-char * freq_getAvailGovs(const int cpu_id )
-{
-    int i, j, k;
-    FILE *fpipe = NULL;
-    char cmd[256];
-    char buff[2048];
-    char tmp[10];
-    char *eptr = NULL, *rptr = NULL, *sptr = NULL;
-    double d = 0;
-    int take_next = 0;
-    bstring bbuff;
-    if (drv == NOT_DETECTED)
-    {
-        freq_getDriver(cpu_id);
-        if (drv == NOT_DETECTED)
-        {
-            return NULL;
-        }
-    }
-
-    sprintf(cmd, "%s 2>&1", daemon_path);
-    if ( !(fpipe = (FILE*)popen(cmd,"r")) )
-    {  // If fpipe is NULL
-        ERROR_PRINT(Problem executing %s, daemon_path);
-        return NULL;
-    }
-    while (fgets(buff, 2048, fpipe))
-    {
-        if (strncmp(buff, "Governors:", 10) == 0)
-        {
-            take_next = 1;
-            continue;
-        }
-        if (take_next)
-        {
-            int eidx = 0;
-            eptr = malloc((strlen(buff)+1) * sizeof(char));
-            memset(eptr, 0, (strlen(buff)+1) * sizeof(char));
-            strncpy(eptr, buff, strlen(buff));
-            break;
-        }
-    }
-    if (pclose(fpipe) == -1)
-    {
-        return NULL;
-    }
-    for (int i=strlen(eptr)-1; i>= 0; i--)
-    {
-        if (eptr[i] == ' ')
-        {
-            eptr[i] = '\0';
-        }
-        else
-        {
-            break;
-        }
-    }
-    return eptr;
-}
-
-int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq)
-{
-    int err = 0;
-    int own_hpm = 0;
-    int cpuId = -1;
-    uint64_t f = freq / 100;
-    double fmin, fmax;
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return -EPERM;
-    }
-    if (isAMD())
-    {
-        return 0;
-    }
-    err = _freq_getUncoreMinMax(socket_id, &cpuId, &fmin, &fmax);
-    if (err < 0)
-    {
-        return err;
-    }
-    if (freq < (uint64_t)fmin)
-    {
-        ERROR_PRINT(Given frequency %llu MHz lower than system limit of %.0f MHz, freq, fmin);
-        return -EINVAL;
-    }
-    if (freq > (uint64_t)fmax)
-    {
-        ERROR_PRINT(Given frequency %llu MHz higher than system limit of %.0f MHz, freq, fmax);
-        return -EINVAL;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-    }
-    err = HPMaddThread(cpuId);
-    if (err != 0)
-    {
-        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-        return 0;
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
-    if (err)
-    {
-        //ERROR_PRINT(Cannot read register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return err;
-    }
-    tmp &= ~(0xFF00);
-    tmp |= (f<<8);
-    err = HPMwrite(cpuId, MSR_DEV, MSR_UNCORE_FREQ, tmp);
-    if (err)
-    {
-        ERROR_PRINT(Cannot write register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return err;
-    }
-
-    if (own_hpm)
-        HPMfinalize();
-    return 0;
-}
-
-
-
-
-uint64_t freq_getUncoreFreqMin(const int socket_id)
-{
-    int err = 0;
-    int own_hpm = 0;
-    int cpuId = -1;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-    if (isAMD())
-    {
-        return 0;
-    }
-    for (int i=0; i<cpuid_topology.numHWThreads; i++)
-    {
-        if (cpuid_topology.threadPool[i].packageId == socket_id)
-        {
-            cpuId = cpuid_topology.threadPool[i].apicId;
-            break;
-        }
-    }
-    if (cpuId < 0)
-    {
-        ERROR_PRINT(Unknown socket ID %d, socket_id);
-        return 0;
-    }
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-    }
-    err = HPMaddThread(cpuId);
-    if (err != 0)
-    {
-        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-        return 0;
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
-    if (err)
-    {
-        //ERROR_PRINT(Cannot read register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return 0;
-    }
-    tmp = ((tmp>>8) & 0xFFULL) * 100;
-
-    if (own_hpm)
-        HPMfinalize();
-    return tmp;
-}
-
-int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq)
-{
-    int err = 0;
-    int own_hpm = 0;
-    int cpuId = -1;
-    uint64_t f = freq / 100;
-    double fmin, fmax;
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return -EPERM;
-    }
-    if (isAMD())
-    {
-        return 0;
-    }
-    err = _freq_getUncoreMinMax(socket_id, &cpuId, &fmin, &fmax);
-    if (err < 0)
-    {
-        return err;
-    }
-    if (freq < (uint64_t)fmin)
-    {
-        ERROR_PRINT(Given frequency %llu MHz lower than system limit of %.0f MHz, freq, fmin);
-        return -EINVAL;
-    }
-    if (freq > (uint64_t)fmax)
-    {
-        ERROR_PRINT(Given frequency %llu MHz higher than system limit of %.0f MHz, freq, fmax);
-        return -EINVAL;
-    }
-
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-    }
-    err = HPMaddThread(cpuId);
-    if (err != 0)
-    {
-        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-        return 0;
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
-    if (err)
-    {
-        //ERROR_PRINT(Cannot read register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return err;
-    }
-    tmp &= ~(0xFFULL);
-    tmp |= (f & 0xFFULL);
-    err = HPMwrite(cpuId, MSR_DEV, MSR_UNCORE_FREQ, tmp);
-    if (err)
-    {
-        ERROR_PRINT(Cannot write register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return err;
-    }
-
-    if (own_hpm)
-        HPMfinalize();
-    return 0;
-}
-
-uint64_t freq_getUncoreFreqMax(const int socket_id)
-{
-    int err = 0;
-    int own_hpm = 0;
-    int cpuId = -1;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-
-    if (isAMD())
-    {
-        return 0;
-    }
-    for (int i=0; i<cpuid_topology.numHWThreads; i++)
-    {
-        if (cpuid_topology.threadPool[i].packageId == socket_id)
-        {
-            cpuId = cpuid_topology.threadPool[i].apicId;
-            break;
-        }
-    }
-    if (cpuId < 0)
-    {
-        ERROR_PRINT(Unknown socket ID %d, socket_id);
-        return 0;
-    }
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-    }
-    err = HPMaddThread(cpuId);
-    if (err != 0)
-    {
-        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-        return 0;
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
-    if (err)
-    {
-        //ERROR_PRINT(Cannot read register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
-        return 0;
-    }
-    tmp = (tmp & 0xFFULL) * 100;
-
-    if (own_hpm)
-        HPMfinalize();
-    return tmp;
-}
-
-uint64_t freq_getUncoreFreqCur(const int socket_id)
-{
-    int err = 0;
-    int own_hpm = 0;
-    int cpuId = -1;
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to frequency backend is locked.\n");
-        return 0;
-    }
-    if (isAMD())
-    {
-        return 0;
-    }
-    for (int i=0; i<cpuid_topology.numHWThreads; i++)
-    {
-        if (cpuid_topology.threadPool[i].packageId == socket_id)
-        {
-            cpuId = cpuid_topology.threadPool[i].apicId;
-            break;
-        }
-    }
-    if (cpuId < 0)
-    {
-        ERROR_PRINT(Unknown socket ID %d, socket_id);
-        return 0;
-    }
-    if (!HPMinitialized())
-    {
-        HPMinit();
-        own_hpm = 1;
-        err = HPMaddThread(cpuId);
-        if (err != 0)
-        {
-            ERROR_PLAIN_PRINT(Cannot get access to MSRs)
-            return 0;
-        }
-    }
-
-    uint64_t tmp = 0x0ULL;
-    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ_READ, &tmp);
-    if (err)
-    {
-        //ERROR_PRINT(Cannot read register 0x%X on CPU %d, MSR_UNCORE_FREQ_READ, cpuId);
-        return 0;
-    }
-    tmp = (tmp & 0xFFULL) * 100;
-
-    if (own_hpm)
-        HPMfinalize();
-    return tmp;
-}
diff --git a/src/frequency_acpi.c b/src/frequency_acpi.c
deleted file mode 100644
index 80e3e3ef5..000000000
--- a/src/frequency_acpi.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  frequency_acpi.c
- *
- *      Description:  Module implementing an interface for frequency manipulation, the
- *                    ACPI CPUFreq backend
- *
- *      Version:   <VERSION>
- *      Released:  <DATE>
- *
- *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
- *                Jan Treibig (jt), jan.treibig@gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <math.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include <bstrlib.h>
-#include <likwid.h>
-#include <types.h>
-#include <error.h>
-#include <topology.h>
-#include <access.h>
-#include <registers.h>
-
-#include <frequency_acpi.h>
-
-
-
-uint64_t freq_acpi_getCpuClockMax(const int cpu_id )
-{
-    FILE *f = NULL;
-    char cmd[256];
-    char buff[256];
-    char* eptr = NULL;
-    uint64_t clock = 0x0ULL;
-
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL) {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    eptr = fgets(cmd, 256, f);
-    if (eptr != NULL)
-    {
-        clock = strtoull(cmd, NULL, 10);
-    }
-    fclose(f);
-    return clock *1E3;
-}
-
-
-uint64_t freq_acpi_getCpuClockMin(const int cpu_id )
-{
-
-    uint64_t clock = 0x0ULL;
-    FILE *f = NULL;
-    char cmd[256];
-    char buff[256];
-    char* eptr = NULL;
-
-    sprintf(buff, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_min_freq", cpu_id);
-    f = fopen(buff, "r");
-    if (f == NULL) {
-        fprintf(stderr, "Unable to open path %s for reading\n", buff);
-        return 0;
-    }
-    eptr = fgets(cmd, 256, f);
-    if (eptr != NULL)
-    {
-        clock = strtoull(cmd, NULL, 10);
-    }
-    fclose(f);
-    return clock *1E3;
-}
-
-
-
diff --git a/src/frequency_cpu.c b/src/frequency_cpu.c
new file mode 100644
index 000000000..d99ac3c63
--- /dev/null
+++ b/src/frequency_cpu.c
@@ -0,0 +1,1134 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  frequency_cpu.c
+ *
+ *      Description:  Module implementing an interface for frequency manipulation
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *                Jan Treibig (jt), jan.treibig@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include <unistd.h>
+#include <error.h>
+#include <string.h>
+
+#include <bstrlib.h>
+#include <likwid.h>
+#include <topology.h>
+#include <access.h>
+#include <registers.h>
+#include <lock.h>
+#include <configuration.h>
+
+#include <frequency.h>
+#include <frequency_client.h>
+
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
+#include <cpuid.h>
+#endif
+
+
+void (*freq_init_f)() = NULL; /* backend hook, NULL until set; signature matches freq_init_direct()/freq_init_client() */
+int (*freq_send)(FreqDataRecordType type, FreqDataRecordLocation loc, int cpu, int len, char* data) = NULL; /* backend hook; signature matches freq_send_direct()/freq_send_client() */
+void (*freq_finalize_f)() = NULL; /* backend hook; signature matches freq_finalize_direct()/freq_finalize_client() */
+static int freq_initialized = 0;
+static int own_hpm = 0; /* set to 1 by the turbo helpers when they had to call HPMinit() themselves */
+/* Descriptor cache with one entry per hardware thread, owned by freq_init_direct()/freq_finalize_direct(). */
+static struct cpufreq_files* cpufiles = NULL;
+/* A cpufreq sysfs path is composed as basefolder1 <cpu> basefolder2 "/" <file name>. */
+static char* basefolder1 = "/sys/devices/system/cpu/cpu";
+static char* basefolder2 = "/cpufreq";
+/* Socket to the frequency daemon as returned by freq_client_startDaemon(); -1 while unset. */
+static int fsocket = -1;
+/* File descriptors of the cpufreq files of one hardware thread; -1 marks a closed or unopened slot. */
+struct cpufreq_files {
+    int  cur_freq;
+    int  max_freq;
+    int  min_freq;
+    int  set_gov;
+    int  avail_freq;
+    int  avail_govs;
+    int  driver;
+    int  set_freq;
+    int  conf_max_freq;
+    int  conf_min_freq;
+};
+/* cpufreq file name per FreqDataRecordLocation, looked up by freq_read_location(). */
+char* cpufreq_filenames[MAX_FREQ_LOCS] ={
+    [FREQ_LOC_CUR] = "scaling_cur_freq",
+    [FREQ_LOC_MAX] = "scaling_max_freq",
+    [FREQ_LOC_MIN] = "scaling_min_freq",
+    [FREQ_LOC_AVAIL_FREQ] = "scaling_available_frequencies",
+    [FREQ_LOC_AVAIL_GOV] = "scaling_available_governors",
+    [FREQ_LOC_GOV] = "scaling_governor",
+    [FREQ_LOC_CONF_MAX] = "cpuinfo_max_freq",
+    [FREQ_LOC_CONF_MIN] = "cpuinfo_min_freq",
+};
+
+/*
+ * Release the cached cpufreq descriptors of a single hardware thread.
+ *
+ * Every field of the given set that holds a descriptor (value >= 0) is
+ * closed and then reset to -1, so calling the function again on the same
+ * set does nothing. Fields that are already negative are skipped.
+ *
+ * cpufiles: descriptor set to release (this parameter shadows the file-level
+ *           cache of the same name); a NULL pointer is ignored.
+ */
+static void close_cpu(struct cpufreq_files* cpufiles)
+{
+    if (cpufiles)
+    {
+        if (cpufiles->cur_freq >= 0)
+        {
+            close(cpufiles->cur_freq);
+            cpufiles->cur_freq = -1;
+        }
+        if (cpufiles->max_freq >= 0)
+        {
+            close(cpufiles->max_freq);
+            cpufiles->max_freq = -1;
+        }
+        if (cpufiles->min_freq >= 0)
+        {
+            close(cpufiles->min_freq);
+            cpufiles->min_freq = -1;
+        }
+        if (cpufiles->set_freq >= 0)
+        {
+            close(cpufiles->set_freq);
+            cpufiles->set_freq = -1;
+        }
+        if (cpufiles->set_gov >= 0)
+        {
+            close(cpufiles->set_gov);
+            cpufiles->set_gov = -1;
+        }
+        if (cpufiles->avail_freq >= 0)
+        {
+            close(cpufiles->avail_freq);
+            cpufiles->avail_freq = -1;
+        }
+        if (cpufiles->avail_govs >= 0)
+        {
+            close(cpufiles->avail_govs);
+            cpufiles->avail_govs = -1;
+        }
+        if (cpufiles->driver >= 0)
+        {
+            close(cpufiles->driver);
+            cpufiles->driver = -1;
+        }
+        if (cpufiles->conf_min_freq >= 0)
+        {
+            close(cpufiles->conf_min_freq);
+            cpufiles->conf_min_freq = -1;
+        }
+        if (cpufiles->conf_max_freq >= 0)
+        {
+            close(cpufiles->conf_max_freq);
+            cpufiles->conf_max_freq = -1;
+        }
+    }
+}
+
+/*
+ * Open a cpufreq sysfs file, read-write if permitted, otherwise read-only.
+ * On success *fd receives the descriptor; if both attempts fail *fd is set
+ * to -1. The return value is always 0, so callers have to inspect *fd.
+ */
+static int open_cpu_file(char* filename, int* fd)
+{
+    /* Prefer write access so a cached descriptor can also serve FREQ_WRITE. */
+    int f = open(filename, O_RDWR);
+    if (f < 0)
+    {
+        f = open(filename, O_RDONLY);
+    }
+    if (f < 0)
+    {
+        /* Bare message text, as in every other DEBUG_PRINT call of this file. */
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Failed to open file %s, filename);
+        f = -1;
+    }
+    *fd = f;
+    return 0;
+}
+
+
+/*
+ * Open the frequently used cpufreq files of one hardware thread and cache the
+ * descriptors in *files; slots that are not opened here stay at -1.
+ * Returns 0 on success, -1 (after closing the set) if cpu is negative or open_cpu_file() fails.
+ */
+static int open_cpu(int cpu, struct cpufreq_files* files)
+{
+    int ret = 0;
+    char fname[1025];
+
+    /* Initialise first so that the cleanup path never closes garbage values. */
+    memset(files, -1, sizeof(struct cpufreq_files));
+    if (cpu >= 0)
+    {
+        ret = snprintf(fname, sizeof(fname), "%s%d%s/%s", basefolder1, cpu, basefolder2, "scaling_cur_freq");
+        /* snprintf() terminates the string itself; a truncated path is skipped. */
+        if (ret > 0 && ret < (int)sizeof(fname))
+        {
+            if (open_cpu_file(fname, &files->cur_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, sizeof(fname), "%s%d%s/%s", basefolder1, cpu, basefolder2, "scaling_max_freq");
+        if (ret > 0 && ret < (int)sizeof(fname))
+        {
+            if (open_cpu_file(fname, &files->max_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, sizeof(fname), "%s%d%s/%s", basefolder1, cpu, basefolder2, "scaling_min_freq");
+        if (ret > 0 && ret < (int)sizeof(fname))
+        {
+            if (open_cpu_file(fname, &files->min_freq) < 0)
+            {
+                goto cleanup;
+            }
+        }
+        ret = snprintf(fname, sizeof(fname), "%s%d%s/%s", basefolder1, cpu, basefolder2, "scaling_governor");
+        if (ret > 0 && ret < (int)sizeof(fname))
+        {
+            if (open_cpu_file(fname, &files->set_gov) < 0)
+            {
+                goto cleanup;
+            }
+        }
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_setspeed");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->set_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_available_governors");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->avail_govs) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_available_frequencies");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->avail_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "scaling_driver");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->driver) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "cpuinfo_min_freq");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->conf_min_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+/*        ret = snprintf(fname, 1024, "%s/%s", dname, "cpuinfo_max_freq");*/
+/*        if (ret > 0)*/
+/*        {*/
+/*            fname[ret] = '\0';*/
+/*            if (open_cpu_file(fname, &files->conf_max_freq) < 0)*/
+/*            {*/
+/*                goto cleanup;*/
+/*            }*/
+/*        }*/
+        return 0;
+    }
+cleanup:
+    close_cpu(files);
+    return -1;
+}
+
+
+/*
+ * Start the frequency daemon and connect to its UNIX domain socket.
+ *
+ * The binary named by FREQDAEMON is executed in a forked child with an empty
+ * argument vector and an empty environment. The parent then polls the socket
+ * path LIKWIDSOCKETBASE "-freq-<child pid>" up to max_tries times, sleeping
+ * wait_usec microseconds between two attempts (2.5 seconds in total).
+ *
+ * Returns the connected socket descriptor, or the negative fork() result if
+ * no child could be created. The whole process exits with EXIT_FAILURE when
+ * the daemon binary is not executable or when the socket does not become
+ * reachable within the retry budget.
+ */
+static int
+freq_client_startDaemon()
+{
+    int res = 0;
+    char *newargv[] = { NULL };
+    char *newenv[] = { NULL };
+    char *exeprog = TOSTRING(FREQDAEMON);
+    struct sockaddr_un address;
+    size_t address_length;
+    pid_t pid;
+    /* Retry budget for connect(): max_tries attempts, wait_usec microseconds apart. */
+    const int max_tries = 1000;
+    const int wait_usec = 2500;
+    int timeout = max_tries;
+    int socket_fd = -1;
+
+    /* Check up front; a failing execve() in the child would only show up as a connect timeout. */
+    if (access(exeprog, X_OK))
+    {
+        fprintf(stderr, "Failed to find the daemon '%s'\n", exeprog);
+        exit(EXIT_FAILURE);
+    }
+    DEBUG_PRINT(DEBUGLEV_INFO, Starting daemon %s, exeprog);
+    pid = fork();
+
+    if (pid == 0)
+    {
+        /* Child: execve() only returns if it failed. */
+        execve(exeprog, newargv, newenv);
+        fprintf(stderr, "Failed to execute the daemon '%s'\n", exeprog);
+        /* _exit() keeps the child from running the parent's atexit handlers */
+        /* and from flushing stdio buffers inherited through fork(). */
+        _exit(EXIT_FAILURE);
+    }
+    else if (pid < 0)
+    {
+        fprintf(stderr, "Failed to fork frequency daemon\n");
+        return pid;
+    }
+
+    EXIT_IF_ERROR(socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0), socket() failed);
+
+    /* pid is the child's PID here and forms the tail of the socket name. */
+    address.sun_family = AF_LOCAL;
+    address_length = sizeof(address);
+    snprintf(address.sun_path, sizeof(address.sun_path), TOSTRING(LIKWIDSOCKETBASE) "-freq-%d", pid);
+
+    /* The daemon needs some time to create its socket, so keep retrying. */
+    res = connect(socket_fd, (struct sockaddr *) &address, address_length);
+    while (res && timeout > 0)
+    {
+        usleep(wait_usec);
+        res = connect(socket_fd, (struct sockaddr *) &address, address_length);
+
+        if (res == 0)
+        {
+            break;
+        }
+
+        timeout--;
+    }
+
+    if (timeout <= 0)
+    {
+        /* Report the time that was really waited and the daemon that was really started. */
+        fprintf(stderr, "Exiting due to timeout: The socket file at '%s' could not be\n", address.sun_path);
+        fprintf(stderr, "opened within %.1f seconds. Consult the error message above\n", max_tries * wait_usec / 1E6);
+        fprintf(stderr, "this to find out why. If the error is 'no such file or directory',\n");
+        fprintf(stderr, "it usually means that '%s' just failed to start.\n", exeprog);
+        exit(EXIT_FAILURE);
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Successfully opened socket %s to daemon, address.sun_path);
+
+    return socket_fd;
+}
+
+
+/* Allocate the per-thread descriptor cache and pre-open each thread's cpufreq files. */
+static void freq_init_direct()
+{
+    const int hwthreads = cpuid_topology.numHWThreads;
+    cpufiles = malloc(hwthreads * sizeof(struct cpufreq_files));
+    if (cpufiles == NULL)
+    {
+        fprintf(stderr,"Failed to allocate space\n");
+        return;
+    }
+    /* A CPU whose files cannot be opened is reported, but the loop carries on. */
+    for (int cpu = 0; cpu < hwthreads; cpu++)
+    {
+        struct cpufreq_files* slot = &cpufiles[cpu];
+        memset(slot, -1, sizeof(struct cpufreq_files));
+        if (open_cpu(cpu, slot) < 0)
+        {
+            fprintf(stderr,"Failed to open files for CPU %d\n", cpu);
+        }
+    }
+}
+
+/*
+ * Read up to len bytes of the cpufreq file selected by loc for one CPU into
+ * data (no NUL terminator is added). The file is opened and closed per call.
+ * Returns 0 on success or when no path could be built, a negative value otherwise.
+ */
+static int freq_read_location(FreqDataRecordLocation loc, int cpu, int len, char* data)
+{
+    char fname[1024];
+    int fd = -1;
+    /* An unknown location must not index past (or hit a hole in) the name table. */
+    if ((int)loc < 0 || (int)loc >= MAX_FREQ_LOCS || cpufreq_filenames[loc] == NULL)
+        return -1;
+    int ret = snprintf(fname, sizeof(fname), "%s%d%s/%s", basefolder1, cpu, basefolder2, cpufreq_filenames[loc]);
+    if (ret <= 0 || ret >= (int)sizeof(fname))
+        return 0;
+    open_cpu_file(fname, &fd);
+    if (fd < 0)
+        return -1; /* same result the failing read() on descriptor -1 used to produce */
+    ret = read(fd, data, len);
+    close(fd);
+    return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Serve one read/write request through sysfs, using the descriptors cached by
+ * freq_init_direct() or, for reads without a cached descriptor, freq_read_location().
+ * Returns 0 on success, -EPERM/-EBADF for rejected writes, -EINVAL for an unknown location, else the negative I/O result.
+ */
+static int freq_send_direct(FreqDataRecordType type, FreqDataRecordLocation loc, int cpu, int len, char* data)
+{
+    int fd = -1;
+    int ret = 0;
+    int only_read = 0;
+    struct cpufreq_files uncached;
+    /* Without a cache entry for this CPU every slot counts as unopened (-1). */
+    memset(&uncached, -1, sizeof(uncached));
+    struct cpufreq_files* f = (cpufiles && cpu >= 0 && cpu < (int)cpuid_topology.numHWThreads) ? &cpufiles[cpu] : &uncached;
+
+    switch(loc)
+    {
+        case FREQ_LOC_CUR:
+            fd = f->cur_freq;
+            only_read = 1;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_CUR FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_MIN:
+            fd = f->min_freq;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_MIN FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_MAX:
+            fd = f->max_freq;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_MAX FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_GOV:
+            fd = f->set_gov;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_GOV FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_AVAIL_GOV:
+            fd = f->avail_govs;
+            only_read = 1;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_AVAIL_GOV FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_AVAIL_FREQ:
+            fd = f->avail_freq;
+            only_read = 1;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_AVAIL_FREQ FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_CONF_MIN:
+            fd = f->conf_min_freq;
+            only_read = 1;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_CONF_MIN FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        case FREQ_LOC_CONF_MAX:
+            fd = f->conf_max_freq;
+            only_read = 1;
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, CMD %s CPU %d FREQ_LOC_CONF_MAX FD %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, fd);
+            break;
+        default:
+            fprintf(stderr,"Invalid location specified in record\n");
+            return -EINVAL;
+    }
+    if (type == FREQ_WRITE)
+    {
+        /* A write must never end up in the read-only fallback below. */
+        if (only_read)
+            return -EPERM;
+        if (fd < 0)
+            return -EBADF;
+        lseek(fd, 0, SEEK_SET);
+        ret = write(fd, data, len);
+    }
+    else if (fd < 0)
+    {
+        ret = freq_read_location(loc, cpu, len, data);
+    }
+    else if (type == FREQ_READ) /* descriptor 0 is valid too, so only negative values mean "not open" */
+    {
+        lseek(fd, 0, SEEK_SET);
+        ret = read(fd, data, len);
+    }
+    return (ret < 0) ? ret : 0;
+}
+
+/* Close all cached cpufreq descriptors and release the per-thread cache. */
+static void freq_finalize_direct()
+{
+    if (cpufiles == NULL)
+    {
+        return; /* nothing allocated, or already released */
+    }
+    const int hwthreads = cpuid_topology.numHWThreads;
+    for (int cpu = 0; cpu < hwthreads; cpu++)
+    {
+        close_cpu(&cpufiles[cpu]);
+    }
+    free(cpufiles);
+    cpufiles = NULL;
+}
+
+/* Start the frequency daemon and keep the socket to it in fsocket. */
+static void freq_init_client()
+{
+    const int daemon_socket = freq_client_startDaemon();
+    fsocket = daemon_socket;
+}
+
+/*
+ * Send one request record to the frequency daemon and evaluate its reply.
+ * data carries the payload of the request; for FREQ_READ it also receives the
+ * daemon's answer (at most len bytes including the terminating NUL).
+ * Returns 0 on success or when no daemon is connected, otherwise a negative
+ * errno value derived from the daemon's error code (-1 for unknown codes).
+ */
+static int freq_send_client(FreqDataRecordType type, FreqDataRecordLocation loc, int cpu, int len, char* data)
+{
+    FreqDataRecord record;
+    if (fsocket < 0)
+        return 0;
+    memset(&record, 0, sizeof(FreqDataRecord));
+    record.type = type;
+    record.loc = loc;
+    record.cpu = cpu;
+    record.errorcode = FREQ_ERR_NONE;
+    snprintf(record.data, LIKWID_FREQUENCY_MAX_DATA_LENGTH, "%.*s", len, data);
+    record.datalen = len;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, DAEMON CMD %s CPU %d LOC %d, (type == FREQ_WRITE ? "WRITE" : "READ"), cpu, loc);
+    CHECK_ERROR(write(fsocket, &record, sizeof(FreqDataRecord)),socket write failed);
+    CHECK_ERROR(read(fsocket, &record, sizeof(FreqDataRecord)), socket read failed);
+    switch (record.errorcode)
+    {
+        case FREQ_ERR_NONE:    break;
+        case FREQ_ERR_NOFILE:  return -ENOENT;
+        case FREQ_ERR_NOPERM:  return -EACCES;
+        case FREQ_ERR_UNKNOWN: return -EBADF;
+        default:               return -1;
+    }
+    /* The reply lives in record.data; hand it back like the direct backend's read() does. */
+    if (type == FREQ_READ && data != NULL && len > 0)
+        snprintf(data, len, "%.*s", (int)LIKWID_FREQUENCY_MAX_DATA_LENGTH, record.data);
+    return 0;
+}
+
+/* Send a FREQ_EXIT record to the daemon and close the socket to it. */
+static void freq_finalize_client()
+{
+    if (fsocket < 0)
+    {
+        return; /* no daemon connection to shut down */
+    }
+    FreqDataRecord record;
+    memset(&record, 0, sizeof(FreqDataRecord));
+    record.type = FREQ_EXIT;
+    DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, DAEMON CMD CLOSE);
+    CHECK_ERROR(write(fsocket, &record, sizeof(FreqDataRecord)),socket write failed);
+    CHECK_ERROR(close(fsocket),socket close failed);
+    fsocket = -1;
+}
+
+
+/*
+ * Report the turbo state of an AMD CPU as stored in bit 25 of MSR 0xC0010015.
+ *
+ * Returns 1 while the bit is clear and 0 while it is set; 0 is also returned
+ * when the frequency backend is locked. If the thread cannot be registered
+ * with HPM or the MSR cannot be read, the HPM error code is returned instead.
+ */
+static int getAMDTurbo(const int cpu_id)
+{
+    uint64_t msr_value = 0x0ULL;
+    int err = 0;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return 0;
+    }
+
+    /* Bring up HPM on demand and note that this module did it. */
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    /* The thread is registered no matter who initialised HPM. */
+    err = HPMaddThread(cpu_id);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        return err;
+    }
+
+    err = HPMread(cpu_id, MSR_DEV, 0xC0010015, &msr_value);
+    if (err)
+    {
+        ERROR_PLAIN_PRINT(Cannot read register 0xC0010015);
+        return err;
+    }
+
+    /* Turbo is reported as active while bit 25 is clear. */
+    const int bit25 = (int)((msr_value >> 25) & 0x1);
+    return bit25 == 0;
+}
+
+/*
+ * Enable or disable turbo on an AMD CPU by rewriting bit 25 of MSR 0xC0010015
+ * (read-modify-write): the bit is cleared for turbo != 0 and set for turbo == 0.
+ *
+ * Returns 1 after a successful write, -EPERM when the frequency backend is
+ * locked, or the non-zero HPM error code of the step that failed.
+ */
+static int setAMDTurbo(const int cpu_id, const int turbo)
+{
+    const uint64_t turbo_off_bit = (1ULL << 25);
+    uint64_t msr_value = 0x0ULL;
+    int err = 0;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return -EPERM;
+    }
+
+    /* Bring up HPM on demand and note that this module did it. */
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    /* The thread is registered no matter who initialised HPM. */
+    err = HPMaddThread(cpu_id);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        return err;
+    }
+
+    err = HPMread(cpu_id, MSR_DEV, 0xC0010015, &msr_value);
+    if (err)
+    {
+        ERROR_PLAIN_PRINT(Cannot read register 0xC0010015);
+        return err;
+    }
+
+    /* Switching turbo on clears the bit, switching it off sets the bit. */
+    if (turbo != 0)
+    {
+        msr_value &= ~turbo_off_bit;
+    }
+    else
+    {
+        msr_value |= turbo_off_bit;
+    }
+    err = HPMwrite(cpu_id, MSR_DEV, 0xC0010015, msr_value);
+    if (err)
+    {
+        ERROR_PLAIN_PRINT(Cannot write register 0xC0010015);
+        return err;
+    }
+    return 1;
+}
+
+/*
+ * Report the turbo state of an Intel CPU as stored in bit 38 of MSR_IA32_MISC_ENABLE.
+ * Returns 1 while the bit is clear and 0 while it is set; 0 is also returned
+ * when the frequency backend is locked. If the thread cannot be registered
+ * with HPM or the MSR cannot be read, the HPM error code is returned instead.
+ */
+static int getIntelTurbo(const int cpu_id)
+{
+    uint64_t msr_value = 0x0ULL;
+    int err = 0;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return 0;
+    }
+
+    /* Bring up HPM on demand and note that this module did it. */
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    /* The thread is registered no matter who initialised HPM. */
+    err = HPMaddThread(cpu_id);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        return err;
+    }
+
+    err = HPMread(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, &msr_value);
+    if (err)
+    {
+        ERROR_PRINT(Cannot read register 0x%x, MSR_IA32_MISC_ENABLE);
+        return err;
+    }
+
+    /* Turbo is reported as active while bit 38 is clear. */
+    const int bit38 = (int)((msr_value >> 38) & 0x1);
+    return bit38 == 0;
+}
+
+static int setIntelTurbo(const int cpu_id, const int turbo)
+{
+    /*
+     * Switch turbo mode for cpu_id via bit 38 of MSR_IA32_MISC_ENABLE: the
+     * bit is cleared when turbo is non-zero and set when turbo is zero.
+     * Returns 1 on success, -EPERM if the backend is locked, or the error
+     * code of the failing HPM call.
+     */
+    uint64_t msr_val = 0x0ULL;
+    int status = 0;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return -EPERM;
+    }
+
+    /* Bring up the HPM layer ourselves if nobody did and remember that. */
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+
+    status = HPMaddThread(cpu_id);
+    if (status != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        return status;
+    }
+
+    status = HPMread(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, &msr_val);
+    if (status)
+    {
+        ERROR_PRINT(Cannot read register 0x%x, MSR_IA32_MISC_ENABLE);
+        return status;
+    }
+
+    /* Read-modify-write: only bit 38 changes. */
+    if (turbo)
+    {
+        msr_val &= ~(1ULL << 38);
+    }
+    else
+    {
+        msr_val |= (1ULL << 38);
+    }
+    status = HPMwrite(cpu_id, MSR_DEV, MSR_IA32_MISC_ENABLE, msr_val);
+    if (status)
+    {
+        ERROR_PRINT(Cannot write register 0x%x, MSR_IA32_MISC_ENABLE);
+        return status;
+    }
+    return status == 0;
+}
+
+int /* returns 1 when config.daemonMode is not one of the handled access modes, otherwise 0 */
+_freqInit(void)
+{
+    int ret = 0;
+
+    if (freq_init_f == NULL) /* no backend selected yet; when none gets selected below this stays NULL and the body runs again on the next call */
+    {
+#if defined(__x86_64__) || defined(__i386__)
+        if (config.daemonMode == -1) /* mode not configured: fall back to the access daemon */
+        {
+            config.daemonMode = ACCESSMODE_DAEMON;
+        }
+        if (config.daemonMode == ACCESSMODE_DAEMON)
+        {
+            DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for daemon mode);
+            freq_init_f = freq_init_client;
+            freq_send = freq_send_client;
+            freq_finalize_f = freq_finalize_client;
+        }
+        else if (config.daemonMode == ACCESSMODE_DIRECT)
+        {
+            DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Adjusting functions for direct mode);
+            freq_init_f = freq_init_direct;
+            freq_send = freq_send_direct;
+            freq_finalize_f = freq_finalize_direct;
+        }
+        else if (config.daemonMode == ACCESSMODE_PERF)
+        {
+            DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, Frequency module not usable in perf_event mode); /* freq_send is left unset in this mode */
+        }
+        else
+        {
+            ret = 1; /* unknown access mode; initialization below still proceeds */
+        }
+#endif
+        if (freq_init_f)
+        {
+            freq_init_f();
+        }
+        if (freq_init_f != freq_init_direct) /* the direct backend is initialized in every mode, but only once */
+        {
+            freq_init_direct();
+        }
+        freq_initialized = 1;
+    }
+    return ret;
+}
+
+void _freqFinalize(void)
+{
+    /* Shut down the selected and the direct backend, reset the dispatch
+     * pointers and release HPM if this module initialized it. */
+    if (freq_finalize_f)
+        freq_finalize_f();
+    if (freq_finalize_f != freq_finalize_direct)
+        freq_finalize_direct();
+    freq_initialized = 0;
+    freq_finalize_f = NULL;
+    freq_send = NULL;
+    freq_init_f = NULL;
+    if (own_hpm)
+        HPMfinalize();
+    /* Drop ownership: a later call must not finalize HPM a second time. */
+    own_hpm = 0;
+}
+
+int freq_init(void) /* Public initializer: forwards to _freqInit() and returns its result. */
+{
+    return _freqInit();
+}
+
+void freq_finalize(void) /* Public finalizer: forwards to _freqFinalize(). */
+{
+    _freqFinalize();
+}
+
+uint64_t freq_setCpuClockMax(const int cpu_id, const uint64_t freq)
+{
+    /* Write freq as decimal text to FREQ_LOC_MAX of cpu_id; returns freq on success, -EINVAL converted to uint64_t otherwise. */
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = snprintf(s, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1, "%llu", (unsigned long long)freq);
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    if (ret > 0 && ret < LIKWID_FREQUENCY_MAX_DATA_LENGTH-1 && freq_send != NULL) /* text not truncated and a send backend was selected */
+    {
+        ret = freq_send(FREQ_WRITE, FREQ_LOC_MAX, cpu_id, ret, s);
+        if (!ret)
+        {
+            return freq;
+        }
+    }
+    return -EINVAL;
+}
+
+uint64_t freq_setCpuClockMin(const int cpu_id, const uint64_t freq)
+{
+    /* Write freq as decimal text to FREQ_LOC_MIN of cpu_id; returns freq on success and 0 otherwise. */
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = snprintf(s, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1, "%llu", (unsigned long long)freq);
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    if (ret > 0 && ret < LIKWID_FREQUENCY_MAX_DATA_LENGTH-1 && freq_send != NULL) /* text not truncated and a send backend was selected */
+    {
+        ret = freq_send(FREQ_WRITE, FREQ_LOC_MIN, cpu_id, ret, s);
+        if (!ret)
+        {
+            return freq;
+        }
+    }
+    return 0;
+}
+
+uint64_t freq_setCpuClockCurrent(const int cpu_id, const uint64_t freq)
+{
+    /* Write freq as decimal text to FREQ_LOC_CUR of cpu_id; returns freq on success and 0 otherwise. */
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = snprintf(s, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1, "%llu", (unsigned long long)freq);
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    if (ret > 0 && ret < LIKWID_FREQUENCY_MAX_DATA_LENGTH-1 && freq_send != NULL) /* text not truncated and a send backend was selected */
+    {
+        ret = freq_send(FREQ_WRITE, FREQ_LOC_CUR, cpu_id, ret, s);
+        if (!ret)
+        {
+            return freq;
+        }
+    }
+    return 0;
+}
+
+
+int freq_setGovernor(const int cpu_id, const char* gov)
+{
+    /* Send governor name gov to FREQ_LOC_GOV of cpu_id; returns freq_send's result, or -EINVAL for a NULL/empty/over-long name or a missing backend. */
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = (gov != NULL) ? snprintf(s, LIKWID_FREQUENCY_MAX_DATA_LENGTH-1, "%s", gov) : -1;
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    if (ret > 0 && ret < LIKWID_FREQUENCY_MAX_DATA_LENGTH-1 && freq_send != NULL) /* ret beyond the buffer means gov was truncated; never index s with it */
+    {
+        return freq_send(FREQ_WRITE, FREQ_LOC_GOV, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s);
+    }
+    return -EINVAL;
+}
+
+
+uint64_t freq_getCpuClockCurrent(const int cpu_id) /* Reads FREQ_LOC_CUR for cpu_id via freq_send_direct and returns it as a number. */
+{
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    if (!freq_initialized)
+    {
+        _freqInit(); /* lazy module initialization */
+    }
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = freq_send_direct(FREQ_READ, FREQ_LOC_CUR, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s); /* always the direct backend, whatever mode was selected */
+    if (!ret)
+    {
+        uint64_t f = strtoull(s, NULL, 10); /* reply is parsed as base-10 text */
+        if (f > 0)
+        {
+            return f;
+        }
+    }
+    return 0; /* read failed or the reply parsed to 0 */
+}
+
+uint64_t freq_getCpuClockMin(const int cpu_id) /* Reads FREQ_LOC_MIN for cpu_id via freq_send_direct and returns it as a number. */
+{
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    if (!freq_initialized)
+    {
+        _freqInit(); /* lazy module initialization */
+    }
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = freq_send_direct(FREQ_READ, FREQ_LOC_MIN, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s); /* always the direct backend, whatever mode was selected */
+    if (!ret)
+    {
+        uint64_t f = strtoull(s, NULL, 10); /* reply is parsed as base-10 text */
+        if (f > 0)
+        {
+            return f;
+        }
+    }
+    return -1; /* failure sentinel: -1 converts to UINT64_MAX in the uint64_t return type */
+}
+
+uint64_t freq_getCpuClockMax(const int cpu_id) /* Reads FREQ_LOC_MAX for cpu_id via freq_send_direct and returns it as a number. */
+{
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    if (!freq_initialized)
+    {
+        _freqInit(); /* lazy module initialization */
+    }
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = freq_send_direct(FREQ_READ, FREQ_LOC_MAX, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s); /* always the direct backend, whatever mode was selected */
+    if (!ret)
+    {
+        uint64_t f = strtoull(s, NULL, 10); /* reply is parsed as base-10 text */
+        if (f > 0)
+        {
+            return f;
+        }
+    }
+    return -1; /* failure sentinel: -1 converts to UINT64_MAX in the uint64_t return type */
+}
+
+uint64_t freq_getConfCpuClockMin(const int cpu_id) /* Reads FREQ_LOC_CONF_MIN for cpu_id via freq_send_direct and returns it as a number. */
+{
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    if (!freq_initialized)
+    {
+        _freqInit(); /* lazy module initialization */
+    }
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = freq_send_direct(FREQ_READ, FREQ_LOC_CONF_MIN, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s); /* always the direct backend, whatever mode was selected */
+    if (!ret)
+    {
+        uint64_t f = strtoull(s, NULL, 10); /* reply is parsed as base-10 text */
+        if (f > 0)
+        {
+            return f;
+        }
+    }
+    return -1; /* failure sentinel: -1 converts to UINT64_MAX in the uint64_t return type */
+}
+
+uint64_t freq_getConfCpuClockMax(const int cpu_id) /* Reads FREQ_LOC_CONF_MAX for cpu_id via freq_send_direct and returns it as a number. */
+{
+    char s[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+    if (!freq_initialized)
+    {
+        _freqInit(); /* lazy module initialization */
+    }
+    memset(s, '\0', LIKWID_FREQUENCY_MAX_DATA_LENGTH*sizeof(char));
+    int ret = freq_send_direct(FREQ_READ, FREQ_LOC_CONF_MAX, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s); /* always the direct backend, whatever mode was selected */
+    if (!ret)
+    {
+        uint64_t f = strtoull(s, NULL, 10); /* reply is parsed as base-10 text */
+        if (f > 0)
+        {
+            return f;
+        }
+    }
+    return -1; /* failure sentinel: -1 converts to UINT64_MAX in the uint64_t return type */
+}
+
+char * freq_getGovernor(const int cpu_id )
+{
+    /* Returns the FREQ_LOC_GOV reply for cpu_id as a heap string the caller must free(), or NULL on failure. */
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    char *s = calloc(LIKWID_FREQUENCY_MAX_DATA_LENGTH, sizeof(char));
+    if (s)
+    {
+        int ret = freq_send_direct(FREQ_READ, FREQ_LOC_GOV, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s);
+        if (!ret)
+        {
+            s[strcspn(s, "\n")] = '\0'; /* cut at the line end; indexing with strlen(s)-1 would write before the buffer on an empty reply */
+            return s;
+        }
+        free(s);
+    }
+    return NULL;
+}
+
+char * freq_getAvailFreq(const int cpu_id )
+{
+    /* Returns the FREQ_LOC_AVAIL_FREQ reply for cpu_id as a heap string the caller must free(), or NULL on failure. */
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    char *s = calloc(LIKWID_FREQUENCY_MAX_DATA_LENGTH, sizeof(char));
+    if (s)
+    {
+        int ret = freq_send_direct(FREQ_READ, FREQ_LOC_AVAIL_FREQ, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s);
+        if (!ret)
+        {
+            s[strcspn(s, "\n")] = '\0'; /* cut at the line end; indexing with strlen(s)-1 would write before the buffer on an empty reply */
+            return s;
+        }
+        free(s);
+    }
+    return NULL;
+}
+
+char * freq_getAvailGovs(const int cpu_id )
+{
+    /* Returns the FREQ_LOC_AVAIL_GOV reply for cpu_id as a heap string the caller must free(), or NULL on failure. */
+    if (!freq_initialized)
+    {
+        _freqInit();
+    }
+    char *s = calloc(LIKWID_FREQUENCY_MAX_DATA_LENGTH, sizeof(char));
+    if (s)
+    {
+        int ret = freq_send_direct(FREQ_READ, FREQ_LOC_AVAIL_GOV, cpu_id, LIKWID_FREQUENCY_MAX_DATA_LENGTH, s);
+        if (!ret)
+        {
+            s[strcspn(s, "\n")] = '\0'; /* cut at the line end; indexing with strlen(s)-1 would write before the buffer on an empty reply */
+            return s;
+        }
+        free(s);
+    }
+    return NULL;
+}
+
+int freq_getTurbo(const int cpu_id)
+{
+    /* Vendor dispatch: AMD CPUs use getAMDTurbo(), all others use
+     * getIntelTurbo(); the helper's return value is passed through. */
+    if (isAMD())
+        return getAMDTurbo(cpu_id);
+    return getIntelTurbo(cpu_id);
+}
+
+int freq_setTurbo(const int cpu_id, const int turbo)
+{
+    /* Vendor dispatch: AMD CPUs use setAMDTurbo(), all others use
+     * setIntelTurbo(); the helper's return value is passed through. */
+    if (isAMD())
+        return setAMDTurbo(cpu_id, turbo);
+    return setIntelTurbo(cpu_id, turbo);
+}
+
+void __attribute__((destructor (104))) close_frequency_cpu(void) /* registered as a destructor with priority 104; finalizes the frequency module */
+{
+    _freqFinalize();
+}
diff --git a/src/frequency_pstate.c b/src/frequency_pstate.c
deleted file mode 100644
index a48fd959b..000000000
--- a/src/frequency_pstate.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  frequency_pstate.c
- *
- *      Description:  Module implementing an interface for frequency manipulation, the
- *                    Intel PState backend
- *
- *      Version:   <VERSION>
- *      Released:  <DATE>
- *
- *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
- *                Amin Nabikhani, amin.nabikhani@gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <math.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include <bstrlib.h>
-#include <likwid.h>
-#include <types.h>
-#include <error.h>
-#include <topology.h>
-#include <access.h>
-#include <registers.h>
-
-#include <frequency_pstate.h>
-
-static unsigned int freqs[100];
-static unsigned int percent[100];
-static unsigned int num_steps = 0;
-
-static int mode()
-{
-    char readval[5];
-    int tmode;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/no_turbo","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 5, fp) )
-        {
-            tmode = atoi(readval);
-        }
-        fclose(fp);
-    }
-    return tmode;
-}
-
-static unsigned int turbo_pct()
-{
-    char readval[4];
-    unsigned int turbo_pct;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/turbo_pct","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 4, fp) )
-        {
-            turbo_pct = strtoul(readval,NULL,10);
-        }
-        fclose(fp);
-    }
-    return turbo_pct;
-}
-
-
-static unsigned int getMax()
-{
-    char line[1024];
-    unsigned int maxFreq = 0;
-    unsigned int trb = turbo_pct();
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/cpufreq/policy0/cpuinfo_max_freq", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        maxFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-    if(maxFreq != 0)
-    {
-        int t = mode();
-        if (t != 0)
-        {
-            maxFreq /= (1+0.01*trb);
-        }
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-    return maxFreq;
-}
-
-
-static unsigned int getMin()
-{
-    char line[1024];
-    unsigned int minFreq = 0;
-    char* eptr;
-    FILE* fp = fopen("/sys/devices/system/cpu/cpufreq/policy0/cpuinfo_min_freq", "r");
-    if(fp != NULL)
-    {
-        eptr = fgets(line, 1024, fp);
-        minFreq = strtoul(line, NULL, 10);
-        fclose(fp);
-    }
-    else
-    {
-        fprintf(stderr, "\tEXIT WITH ERROR:  Max Freq. could not be read\n");
-        exit(EXIT_FAILURE);
-    }
-
-    return minFreq;
-}
-
-
-
-static unsigned int num_pstates()
-{
-    char readval[4];
-    unsigned int num;
-    FILE* fp = fopen("/sys/devices/system/cpu/intel_pstate/num_pstates","r");
-    if (fp != NULL)
-    {
-        while( fgets(readval, 4, fp) )
-        {
-            num = strtoul(readval,NULL,10);
-        }
-        fclose(fp);
-    }
-    else
-    {
-        exit(1);
-    }
-    return num;
-}
-
-
-
-static void steps()
-{
-    unsigned int minFreq = getMin();
-    unsigned int trb = turbo_pct();
-    unsigned int maxFreq = getMax();
-    unsigned int step = num_pstates();
-    int range = 0;
-
-    
-    if(step != 0)
-    {
-        range = (maxFreq-minFreq)/step;
-        freqs[0] = minFreq;
-        freqs[step-1]= maxFreq;
-        percent[0] = (minFreq/(float)maxFreq) * 100;
-        percent[step-1] = 100;
-        double t = 0;
-
-        for(size_t i=1; i < step-1; i++)
-        {
-            freqs[i] = minFreq+ i* range;
-            t = (((double)(freqs[i]))/((double)maxFreq)) * 100;
-            percent[i] = (unsigned int)t;
-        }
-        num_steps = step;
-    }
-    else
-    {
-        fprintf(stderr,"\tEXIT WITH ERROR:  # of pstates could not be read");
-    }
-}
-
-
-uint64_t freq_pstate_getCpuClockMax(const int cpu_id )
-{
-    char buff[256];
-    unsigned int pct = 0;
-    unsigned int maxFreq = getMax();
-    if (num_steps == 0)
-    {
-        steps();
-    }
-    uint64_t clock = ((percent[num_steps-1]) * maxFreq) * 10;
-    FILE* f = fopen("/sys/devices/system/cpu/intel_pstate/max_perf_pct","r");
-    if (f != NULL)
-    {
-        char *eptr = fgets(buff, 256, f);
-        if (eptr != NULL)
-        {
-            pct = strtoull(buff, NULL, 10);
-            for (int i=num_steps-1; i >= 0; i--)
-            {
-                if (percent[i] == pct)
-                {
-                    //clock = freqs[i]*1000;
-                    clock = ((percent[i]) * maxFreq) * 10; // *1000/100
-                    break;
-                }
-            }
-        }
-        fclose(f);
-    }
-    return clock;
-}
-
-
-
-uint64_t freq_pstate_getCpuClockMin(const int cpu_id )
-{
-    char buff[256];
-    unsigned int pct = 0;
-    unsigned int maxFreq = getMax();
-    if (num_steps == 0)
-    {
-        steps();
-    }
-    uint64_t clock = ((percent[0]) * maxFreq) * 10;
-    FILE* f = fopen("/sys/devices/system/cpu/intel_pstate/min_perf_pct","r");
-    if (f != NULL)
-    {
-        char *eptr = fgets(buff, 256, f);
-        if (eptr != NULL)
-        {
-            pct = strtoull(buff, NULL, 10);
-            for (int i=0; i < num_steps; i++)
-            {
-                if (percent[i] == pct)
-                {
-                    if (i > 0)
-                        clock = ((percent[i-1]) * maxFreq) * 10;
-                    else
-                        clock = ((percent[i]) * maxFreq) * 10;
-                    break;
-                }
-            }
-        }
-        fclose(f);
-    }
-    return clock;
-}
-
-int freq_pstate_getTurbo(const int cpu_id )
-{
-    return (mode() ? 0 : 1);
-}
-
diff --git a/src/frequency_uncore.c b/src/frequency_uncore.c
new file mode 100644
index 000000000..1d6664b01
--- /dev/null
+++ b/src/frequency_uncore.c
@@ -0,0 +1,421 @@
+
+#include <bstrlib.h>
+#include <likwid.h>
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <access.h>
+#include <registers.h>
+#include <lock.h>
+
+#include <frequency.h>
+
+
+static int _freq_getUncoreMinMax(const int socket_id, int *cpuId, double* min, double* max)
+{
+    /*
+     * Resolve a HW thread of socket_id (stored in cpuId) and fill min/max from
+     * the CPU frequency list and the first turbo step; 0 or negative errno.
+     */
+    int cpu = -1;
+    *cpuId = -1;
+    *min = 0;
+    *max = 0;
+    for (int i=0; i<cpuid_topology.numHWThreads; i++)
+    {
+        if (cpuid_topology.threadPool[i].packageId == socket_id)
+        {
+            cpu = cpuid_topology.threadPool[i].apicId;
+            break;
+        }
+    }
+    if (cpu < 0)
+    {
+        fprintf(stderr, "Unknown socket ID %d\n", socket_id);
+        return -ENODEV;
+    }
+
+    char* avail = freq_getAvailFreq(cpu);
+    if (!avail)
+    {
+        avail = malloc(1000 * sizeof(char));
+        if (avail)
+        {
+            int ret = snprintf(avail, 999, "%d %d", (int)(freq_getConfCpuClockMin(cpu)/1000000), (int)(freq_getConfCpuClockMax(cpu)/1000000)); /* casts make the uint64_t quotients match %d */
+            if (ret <= 0)
+            {
+                free(avail);
+                fprintf(stderr, "Failed to get available CPU frequencies\n");
+                return -EINVAL;
+            }
+        }
+        else
+        {
+            fprintf(stderr, "Failed to get available CPU frequencies\n");
+            return -EINVAL;
+        }
+    }
+
+    double dmin = 0.0, dmax = 0.0;
+    bstring bavail = bfromcstr(avail);
+    free(avail);
+    struct bstrList* bavail_list = bsplit(bavail, ' ');
+    bdestroy(bavail);
+    if (bavail_list == NULL || bavail_list->qty < 2) /* bsplit yields NULL when the string could not be built or split */
+    {
+        fprintf(stderr, "Failed to read minimal and maximal frequencies\n");
+        bstrListDestroy(bavail_list);
+        return -EINVAL;
+    }
+    if (blength(bavail_list->entry[0]) > 0)
+    {
+        char* tptr = NULL;
+        dmin = strtod(bdata(bavail_list->entry[0]), &tptr);
+        if (bdata(bavail_list->entry[0]) != tptr)
+        {
+            dmin *= 1000;
+        }
+        else
+        {
+            fprintf(stderr, "Problem converting %s to double for comparison with given freq.\n", bdata(bavail_list->entry[0]));
+            bstrListDestroy(bavail_list); /* release the list on this exit as well */
+            return -EINVAL;
+        }
+    }
+    if (blength(bavail_list->entry[bavail_list->qty-1]) > 0)
+    {
+        char* tptr = NULL;
+        dmax = strtod(bdata(bavail_list->entry[bavail_list->qty-1]), &tptr);
+        if (bdata(bavail_list->entry[bavail_list->qty-1]) != tptr)
+        {
+            dmax *= 1000;
+        }
+        else
+        {
+            fprintf(stderr, "Problem converting %s to double for comparison with given freq.\n", bdata(bavail_list->entry[bavail_list->qty-1]));
+            bstrListDestroy(bavail_list); /* release the list on this exit as well */
+            return -EINVAL;
+        }
+    }
+    bstrListDestroy(bavail_list);
+
+    *cpuId = cpu;
+    if (dmin < dmax)
+    {
+        *min = dmin;
+        *max = dmax;
+    }
+    else
+    {
+        *max = dmin;
+        *min = dmax;
+    }
+
+    power_init(cpu);
+    if (power_info.turbo.numSteps > 0)
+    {
+        if (power_info.turbo.steps[0] > *max)
+        {
+            *max = power_info.turbo.steps[0];
+        }
+    }
+
+    return 0;
+}
+
+
+int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq)
+{
+    /* Writes freq/100 into bits 15:8 of MSR_UNCORE_FREQ on a HW thread of
+     * socket_id after range-checking freq. Returns 0 on success (and on AMD,
+     * where nothing is written), otherwise the error code of the failing step. */
+    int err = 0, own_hpm = 0, cpuId = -1;
+    uint64_t f = freq / 100;
+    uint64_t tmp = 0x0ULL;
+    double fmin, fmax;
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return -EPERM;
+    }
+    if (isAMD())
+    {
+        return 0;
+    }
+    err = _freq_getUncoreMinMax(socket_id, &cpuId, &fmin, &fmax);
+    if (err < 0)
+    {
+        return err;
+    }
+    if (freq < (uint64_t)fmin)
+    {
+        ERROR_PRINT(Given frequency %llu MHz lower than system limit of %.0f MHz, freq, fmin);
+        return -EINVAL;
+    }
+    if (freq > (uint64_t)fmax)
+    {
+        ERROR_PRINT(Given frequency %llu MHz higher than system limit of %.0f MHz, freq, fmax);
+        return -EINVAL;
+    }
+
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    err = HPMaddThread(cpuId);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        goto cleanup; /* report the failure instead of returning 0 */
+    }
+
+    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
+    if (err)
+    {
+        goto cleanup;
+    }
+    tmp &= ~(0xFF00ULL);
+    tmp |= ((f & 0xFFULL) << 8); /* mask so an oversized value cannot spill into other fields */
+    err = HPMwrite(cpuId, MSR_DEV, MSR_UNCORE_FREQ, tmp);
+    if (err)
+    {
+        ERROR_PRINT(Cannot write register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
+    }
+
+cleanup:
+    if (own_hpm)
+        HPMfinalize(); /* runs on every exit path once HPM was initialized here */
+    return err;
+}
+
+
+
+
+uint64_t freq_getUncoreFreqMin(const int socket_id)
+{
+    /* Returns bits 15:8 of MSR_UNCORE_FREQ times 100, read on a HW thread of
+     * socket_id; 0 on AMD, on a locked backend and on any failure. */
+    int err = 0, own_hpm = 0, cpuId = -1;
+    uint64_t freq = 0x0ULL;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return 0;
+    }
+    if (isAMD())
+    {
+        return 0;
+    }
+    for (int i=0; i<cpuid_topology.numHWThreads; i++)
+    {
+        if (cpuid_topology.threadPool[i].packageId == socket_id)
+        {
+            cpuId = cpuid_topology.threadPool[i].apicId;
+            break;
+        }
+    }
+    if (cpuId < 0)
+    {
+        ERROR_PRINT(Unknown socket ID %d, socket_id);
+        return 0;
+    }
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    err = HPMaddThread(cpuId);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+    }
+    else
+    {
+        uint64_t tmp = 0x0ULL;
+        err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
+        if (!err)
+        {
+            freq = ((tmp>>8) & 0xFFULL) * 100;
+        }
+    }
+
+    if (own_hpm)
+        HPMfinalize(); /* also reached on failure so a locally initialized HPM is not left open */
+    return freq;
+}
+
+int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq)
+{
+    /* Writes freq/100 into bits 7:0 of MSR_UNCORE_FREQ on a HW thread of
+     * socket_id after range-checking freq. Returns 0 on success (and on AMD,
+     * where nothing is written), otherwise the error code of the failing step. */
+    int err = 0, own_hpm = 0, cpuId = -1;
+    uint64_t f = freq / 100;
+    uint64_t tmp = 0x0ULL;
+    double fmin, fmax;
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return -EPERM;
+    }
+    if (isAMD())
+    {
+        return 0;
+    }
+    err = _freq_getUncoreMinMax(socket_id, &cpuId, &fmin, &fmax);
+    if (err < 0)
+    {
+        return err;
+    }
+    if (freq < (uint64_t)fmin)
+    {
+        ERROR_PRINT(Given frequency %llu MHz lower than system limit of %.0f MHz, freq, fmin);
+        return -EINVAL;
+    }
+    if (freq > (uint64_t)fmax)
+    {
+        ERROR_PRINT(Given frequency %llu MHz higher than system limit of %.0f MHz, freq, fmax);
+        return -EINVAL;
+    }
+
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    err = HPMaddThread(cpuId);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+        goto cleanup; /* report the failure instead of returning 0 */
+    }
+
+    err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
+    if (err)
+    {
+        goto cleanup;
+    }
+    tmp &= ~(0xFFULL);
+    tmp |= (f & 0xFFULL);
+    err = HPMwrite(cpuId, MSR_DEV, MSR_UNCORE_FREQ, tmp);
+    if (err)
+    {
+        ERROR_PRINT(Cannot write register 0x%X on CPU %d, MSR_UNCORE_FREQ, cpuId);
+    }
+
+cleanup:
+    if (own_hpm)
+        HPMfinalize(); /* runs on every exit path once HPM was initialized here */
+    return err;
+}
+
+uint64_t freq_getUncoreFreqMax(const int socket_id)
+{
+    /* Returns bits 7:0 of MSR_UNCORE_FREQ times 100, read on a HW thread of
+     * socket_id; 0 on AMD, on a locked backend and on any failure. */
+    int err = 0, own_hpm = 0, cpuId = -1;
+    uint64_t freq = 0x0ULL;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return 0;
+    }
+
+    if (isAMD())
+    {
+        return 0;
+    }
+    for (int i=0; i<cpuid_topology.numHWThreads; i++)
+    {
+        if (cpuid_topology.threadPool[i].packageId == socket_id)
+        {
+            cpuId = cpuid_topology.threadPool[i].apicId;
+            break;
+        }
+    }
+    if (cpuId < 0)
+    {
+        ERROR_PRINT(Unknown socket ID %d, socket_id);
+        return 0;
+    }
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    err = HPMaddThread(cpuId);
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+    }
+    else
+    {
+        uint64_t tmp = 0x0ULL;
+        err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ, &tmp);
+        if (!err)
+        {
+            freq = (tmp & 0xFFULL) * 100;
+        }
+    }
+
+    if (own_hpm)
+        HPMfinalize(); /* also reached on failure so a locally initialized HPM is not left open */
+    return freq;
+}
+
+uint64_t freq_getUncoreFreqCur(const int socket_id)
+{
+    /* Returns bits 7:0 of MSR_UNCORE_FREQ_READ times 100, read on a HW thread
+     * of socket_id; 0 on AMD, on a locked backend and on any failure. */
+    int err = 0, own_hpm = 0, cpuId = -1;
+    uint64_t freq = 0x0ULL;
+
+    if (!lock_check())
+    {
+        fprintf(stderr,"Access to frequency backend is locked.\n");
+        return 0;
+    }
+    if (isAMD())
+    {
+        return 0;
+    }
+    for (int i=0; i<cpuid_topology.numHWThreads; i++)
+    {
+        if (cpuid_topology.threadPool[i].packageId == socket_id)
+        {
+            cpuId = cpuid_topology.threadPool[i].apicId;
+            break;
+        }
+    }
+    if (cpuId < 0)
+    {
+        ERROR_PRINT(Unknown socket ID %d, socket_id);
+        return 0;
+    }
+    if (!HPMinitialized())
+    {
+        HPMinit();
+        own_hpm = 1;
+    }
+    err = HPMaddThread(cpuId); /* registered even when HPM was already up, matching the other uncore accessors */
+    if (err != 0)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to MSRs)
+    }
+    else
+    {
+        uint64_t tmp = 0x0ULL;
+        err = HPMread(cpuId, MSR_DEV, MSR_UNCORE_FREQ_READ, &tmp);
+        if (!err)
+        {
+            freq = (tmp & 0xFFULL) * 100;
+        }
+    }
+
+    if (own_hpm)
+        HPMfinalize(); /* also reached on failure so a locally initialized HPM is not left open */
+    return freq;
+}
diff --git a/src/includes/bstrlib_helper.h b/src/includes/bstrlib_helper.h
index 0b6d50e47..631a8dc54 100644
--- a/src/includes/bstrlib_helper.h
+++ b/src/includes/bstrlib_helper.h
@@ -1,3 +1,33 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  bstrlib_helper.h
+ *
+ *      Description:  Additional functions to the bstrlib library (header file)
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2019 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
 #ifndef BSTRLIB_HELPER_INCLUDE
 #define BSTRLIB_HELPER_INCLUDE
 
@@ -20,6 +50,8 @@ void bstrListPrint(struct bstrList * sl);
 int btrimbrackets (bstring b);
 int bisnumber(bstring b);
 
+bstring read_file(char *filename);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/access-daemon/setFreq.c b/src/includes/calculator.h
similarity index 52%
rename from src/access-daemon/setFreq.c
rename to src/includes/calculator.h
index 0938c5be9..edca1d9e8 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/includes/calculator.h
@@ -1,18 +1,18 @@
 /*
  * =======================================================================================
  *
- *      Filename:  setFreq.c
+ *      Filename:  calculator.h
  *
- *      Description:  Entry point of frequency daemon
+ *      Description:  Header file for infix calculator
  *
- *      Version:   4.3.2
- *      Released:  12.04.2018
- *
- *      Authors:  Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *      Version:   4.2
+ *      Released:  22.12.2016
  *
+ *      Author:   Jan Treibig (jt), jan.treibig@gmail.com
+ *                Thomas Roehl (tr), thomas.roehl@gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2018 RRZE, University Erlangen-Nuremberg
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,36 +28,9 @@
  *
  * =======================================================================================
  */
+#ifndef CALCULATOR_H
+#define CALCULATOR_H
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <dirent.h>
-#include <errno.h>
-#include <setFreq.h>
-
-
-static int is_pstate()
-{
-    int ret = 1;
-    DIR* dir = opendir("/sys/devices/system/cpu/intel_pstate");
-    if (ENOENT == errno)
-    {
-        //fprintf(stderr, "\tEXIT WITH ERROR:  intel_pstate is not present!\n");
-        ret = 0;
-    }
-
-    closedir(dir);
-    return ret;
-}
-
+int calculate_infix(char* finfix, double *result);
 
-int main(int argc, char** argv)
-{
-    if (is_pstate())
-    {
-        printf("Pstate driver\n");
-        return do_pstate(argc, argv);
-    }
-    else
-        return do_cpufreq(argc, argv);
-}
+#endif
diff --git a/src/includes/calculator_stack.h b/src/includes/calculator_stack.h
new file mode 100644
index 000000000..235898078
--- /dev/null
+++ b/src/includes/calculator_stack.h
@@ -0,0 +1,52 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  calculator_stack.h
+ *
+ *      Description:  Stack implementation for infix calculator
+ *
+ *      Version:   4.2
+ *      Released:  22.12.2016
+ *
+ *      Author:   Brandon Mills (bm), mills.brandont@gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) Brandon Mills
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ *      software and associated documentation files (the "Software"), to deal in the
+ *      Software without restriction, including without limitation the rights to use, copy,
+ *      modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ *      and to permit persons to whom the Software is furnished to do so, subject to the
+ *      following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *      INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *      PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *      HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *      OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *      SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CALCULATOR_STACK_H
+#define CALCULATOR_STACK_H
+
+typedef struct /* Stack of untyped (void*) entries used by the infix calculator */
+{
+    void **content; /* presumably the array holding the pushed pointers -- confirm in the .c file */
+    int size; /* presumably the allocated capacity of content -- confirm */
+    int top; /* presumably the index of the current top entry -- confirm */
+} Stack;
+
+void stackInit(Stack *s, int size);
+void stackPush(Stack *s, void* val);
+void* stackTop(Stack *s);
+void* stackPop(Stack *s);
+int stackSize(Stack *s);
+void stackFree(Stack *s);
+
+#endif /* CALCULATOR_STACK_H */
diff --git a/src/includes/frequency.h b/src/includes/frequency.h
index cc60abb2b..ff90f3456 100644
--- a/src/includes/frequency.h
+++ b/src/includes/frequency.h
@@ -37,6 +37,26 @@
 
 extern char* daemon_path;
 
+#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
+#include <cpuid.h>
+
+/* Report whether CPUID leaf 0 returns ECX == 0x444d4163, i.e. the bytes
+ * "cAMD" that end the "AuthenticAMD" vendor string. Returns 1 or 0. */
+static int isAMD(void)
+{
+    /* All four registers start at zero (leaf 0) so none is uninitialised
+     * if the project's CPUID macro also treats ECX as an input. */
+    unsigned int eax = 0x0, ebx = 0x0, ecx = 0x0, edx = 0x0;
+    CPUID(eax,ebx,ecx,edx);
+    return (ecx == 0x444d4163) ? 1 : 0;
+}
+#else
+/* ARM builds skip the CPUID query and always report "not AMD". */
+static int isAMD(void)
+{
+    return 0;
+}
+#endif
 
 
 #endif /* FREQUENCY_H */
diff --git a/src/includes/frequency_client.h b/src/includes/frequency_client.h
new file mode 100644
index 000000000..2791a845f
--- /dev/null
+++ b/src/includes/frequency_client.h
@@ -0,0 +1,49 @@
+#ifndef LIKWID_FREQUENCY_CLIENT_H
+#define LIKWID_FREQUENCY_CLIENT_H
+
+#include <stdint.h> /* uint32_t, used by FreqDataRecord.cpu */
+
+/* Size in bytes of the fixed buffer FreqDataRecord.data. */
+#define LIKWID_FREQUENCY_MAX_DATA_LENGTH   200
+
+/* Record type stored in FreqDataRecord.type. */
+typedef enum {
+    FREQ_READ = 0,
+    FREQ_WRITE,
+    FREQ_EXIT
+} FreqDataRecordType;
+
+/* Location selector stored in FreqDataRecord.loc. MAX_FREQ_LOCS is listed
+ * last, so its value equals the number of locations named before it. */
+typedef enum {
+    FREQ_LOC_MIN = 0,
+    FREQ_LOC_MAX,
+    FREQ_LOC_CUR,
+    FREQ_LOC_GOV,
+    FREQ_LOC_AVAIL_GOV,
+    FREQ_LOC_AVAIL_FREQ,
+    FREQ_LOC_CONF_MIN,
+    FREQ_LOC_CONF_MAX,
+    MAX_FREQ_LOCS
+} FreqDataRecordLocation;
+
+/* Status code stored in FreqDataRecord.errorcode; FREQ_ERR_NONE is 0. */
+typedef enum {
+    FREQ_ERR_NONE = 0,
+    FREQ_ERR_NOFILE,
+    FREQ_ERR_NOPERM,
+    FREQ_ERR_UNKNOWN
+} FreqDataRecordError;
+
+/* One record: a CPU id, type/location/error codes and a fixed-size buffer.
+ * NOTE(review): datalen is presumably the number of valid bytes in data -- confirm. */
+typedef struct {
+    uint32_t cpu;
+    FreqDataRecordType type;
+    FreqDataRecordLocation loc;
+    FreqDataRecordError errorcode;
+    int datalen;
+    char data[LIKWID_FREQUENCY_MAX_DATA_LENGTH];
+} FreqDataRecord;
+
+#endif /* LIKWID_FREQUENCY_CLIENT_H */
diff --git a/src/includes/likwid-marker.h b/src/includes/likwid-marker.h
new file mode 100644
index 000000000..ceb16af26
--- /dev/null
+++ b/src/includes/likwid-marker.h
@@ -0,0 +1,99 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  likwid-marker.h
+ *
+ *      Description:  Header File of likwid Marker API
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Authors:  Thomas Gruber (tg), thomas.roehl@googlemail.com
+ *
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_MARKER_H
+#define LIKWID_MARKER_H
+
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetRegion() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_RESET(regionTag)
+Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_PERFMON /* markers enabled: forward each macro to its likwid_marker* call */
+#include <likwid.h>
+#define LIKWID_MARKER_INIT                 likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT           likwid_markerThreadInit()
+#define LIKWID_MARKER_REGISTER(regionTag)  likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag)     likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)      likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_SWITCH               likwid_markerNextGroup()
+#define LIKWID_MARKER_RESET(regionTag)     likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_CLOSE                likwid_markerClose()
+#else /* !LIKWID_PERFMON: every marker macro expands to nothing */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_RESET(regionTag)
+#define LIKWID_MARKER_CLOSE
+#endif /* LIKWID_PERFMON */
+
+#endif /* LIKWID_MARKER_H */
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index fbd053fa5..36978a65b 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -48,69 +48,6 @@
 extern int perfmon_verbosity;
 extern int likwid_nvmon_verbosity;
 
-/** \addtogroup MarkerAPI Marker API module
-*  @{
-*/
-/*!
-\def LIKWID_MARKER_INIT
-Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_THREADINIT
-Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_REGISTER(regionTag)
-Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_START(regionTag)
-Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_STOP(regionTag)
-Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
-Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_SWITCH
-Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_RESET(regionTag)
-Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/*!
-\def LIKWID_MARKER_CLOSE
-Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
-*/
-/** @}*/
-
-#ifdef LIKWID_PERFMON
-#define LIKWID_MARKER_INIT likwid_markerInit()
-#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
-#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
-#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
-#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
-#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
-#define LIKWID_MARKER_CLOSE likwid_markerClose()
-#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
-#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
-#else
-#define LIKWID_MARKER_INIT
-#define LIKWID_MARKER_THREADINIT
-#define LIKWID_MARKER_SWITCH
-#define LIKWID_MARKER_REGISTER(regionTag)
-#define LIKWID_MARKER_START(regionTag)
-#define LIKWID_MARKER_STOP(regionTag)
-#define LIKWID_MARKER_CLOSE
-#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
-#define LIKWID_MARKER_RESET(regionTag)
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -342,25 +279,26 @@ General information covers CPU family, model, name and current clock and vendor
 specific information like the version of Intel's performance monitoring facility.
 */
 typedef struct {
-    uint32_t family; /*!< \brief CPU family ID*/
-    uint32_t model; /*!< \brief CPU model ID */
-    uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
-    uint32_t vendor; /*!< \brief Vendor of the CPU */
-    uint32_t part; /*!< \brief Part number of the CPU */
-    uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
-    int      turbo; /*!< \brief Flag if CPU has a turbo mode */
-    char*  osname; /*!< \brief Name of the CPU reported by OS */
-    char*  name; /*!< \brief Name of the CPU as identified by LIKWID */
-    char*  short_name; /*!< \brief Short name of the CPU*/
-    char*  features; /*!< \brief String with all features supported by the CPU*/
+    uint32_t    family; /*!< \brief CPU family ID*/
+    uint32_t    model; /*!< \brief CPU model ID */
+    uint32_t    stepping; /*!< \brief Stepping (version) of the CPU */
+    uint32_t    vendor; /*!< \brief Vendor of the CPU */
+    uint32_t    part; /*!< \brief Part number of the CPU */
+    uint64_t    clock; /*!< \brief Current clock frequency of the executing CPU*/
+    int         turbo; /*!< \brief Flag if CPU has a turbo mode */
+    char*       osname; /*!< \brief Name of the CPU reported by OS */
+    char*       name; /*!< \brief Name of the CPU as identified by LIKWID */
+    char*       short_name; /*!< \brief Short name of the CPU*/
+    char*       features; /*!< \brief String with all features supported by the CPU*/
     int         isIntel; /*!< \brief Flag if it is an Intel CPU*/
-    int     supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
-    int     supportClientmem; /*!< \brief Flag if system has mappable memory controllers */
-    uint32_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
-    uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
-    uint32_t perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
-    uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
-    uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
+    char        architecture[20]; /*!< \brief name of the architecture like x86_64 or ppc64 (comparable with uname -m)*/
+    int         supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+    int         supportClientmem; /*!< \brief Flag if system has mappable memory controllers */
+    uint64_t    featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+    uint32_t    perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+    uint32_t    perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
+    uint32_t    perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+    uint32_t    perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
 } CpuInfo;
 
 /*! \brief Structure with IDs of a HW thread
@@ -529,6 +467,14 @@ extern void numa_setInterleaved(const int* processorList, int numberOfProcessors
 @param [in] domainId ID of NUMA node for the allocation
 */
 extern void numa_membind(void* ptr, size_t size, int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Set memory allocation policy to membind
+
+Set the memory allocation policy to membind for given list of CPUs. This forces
+allocation to be placed in NUMA domains spanning the given processor list.
+@param [in] processorList List of processors
+@param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setMembind(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
 /*! \brief Destroy NUMA information structure
 
 Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
@@ -896,7 +842,6 @@ extern int perfmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default
 */
 extern int perfmon_getNumberOfThreads(void) __attribute__ ((visibility ("default") ));
 
-
 /*! \brief Set verbosity of LIKWID library
 
 */
@@ -918,6 +863,7 @@ Get the counter name as defined in the performance group file
 @return The counter name or NULL in case of failure
 */
 extern char* perfmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+
 /*! \brief Get the name group
 
 Get the name of group. Either it is the name of the performance group or "Custom"
@@ -1037,6 +983,217 @@ extern double perfmon_getResultOfRegionThread(int region, int event, int thread)
 */
 extern double perfmon_getMetricOfRegionThread(int region, int metricId, int threadId) __attribute__ ((visibility ("default") ));
 
+/** @}*/
+
+/*
+################################################################################
+# Performance group related functions
+################################################################################
+*/
+
+/** \addtogroup PerfGroup performance group module
+ *  @{
+ */
+
+/*! \brief The groupInfo data structure describes a performance group
+
+Groups can either be read in from file or be a group with custom event set. For
+performance groups commonly all values are set. For groups with custom event set,
+the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in
+general the nmetrics value is 0.
+*/
+typedef struct {
+    char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */
+    char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */
+    int nevents; /*!< \brief Number of event/counter combinations */
+    char** events; /*!< \brief List of events */
+    char** counters; /*!< \brief List of counter registers */
+    int nmetrics; /*!< \brief Number of metrics */
+    char** metricnames; /*!< \brief Metric names */
+    char** metricformulas; /*!< \brief Metric formulas */
+    char* longinfo; /*!< \brief Descriptive text about the group or empty */
+} GroupInfo;
+
+/*! \brief Initialize values in GroupInfo struct
+
+Initialize values in GroupInfo struct. The function does NOT allocate the GroupInfo struct
+*/
+int perfgroup_new(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Add a counter and event combination to the group
+
+Add a counter and event combination to the group.
+@param [in] ginfo GroupInfo struct
+@param [in] counter String with counter name
+@param [in] event String with event name
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_addEvent(GroupInfo* ginfo, char* counter, char* event) __attribute__ ((visibility ("default") ));
+
+/*! \brief Remove a counter and event combination from a group
+
+Remove a counter and event combination from a group
+@param [in] ginfo GroupInfo struct
+@param [in] counter String with counter name
+*/
+void perfgroup_removeEvent(GroupInfo* ginfo, char* counter) __attribute__ ((visibility ("default") ));
+
+/*! \brief Add a metric to the group
+
+Add a metric to the group
+@param [in] ginfo GroupInfo struct
+@param [in] mname String with metric name/description
+@param [in] mcalc String with metric formula. No spaces in string.
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_addMetric(GroupInfo* ginfo, char* mname, char* mcalc) __attribute__ ((visibility ("default") ));
+/*! \brief Remove a metric from a group
+
+Remove a metric from a group
+@param [in] ginfo GroupInfo struct
+@param [in] mname String with metric name
+*/
+void perfgroup_removeMetric(GroupInfo* ginfo, char* mname) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the event string of a group needed for perfmon_addEventSet
+
+Get the event string of a group needed for perfmon_addEventSet
+@param [in] ginfo GroupInfo struct
+@return String with eventset or NULL
+*/
+char* perfgroup_getEventStr(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Return the eventset string of a group
+
+Return the event string of a group
+@param [in] eventStr Eventset string
+*/
+void perfgroup_returnEventStr(char* eventStr) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the group name of a group
+
+Get the group name of a group
+@param [in] ginfo GroupInfo struct
+@return String with group name or NULL
+*/
+char* perfgroup_getGroupName(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Set the group name of a group
+
+Set the group name of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] groupName String with group name
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setGroupName(GroupInfo* ginfo, char* groupName) __attribute__ ((visibility ("default") ));
+/*! \brief Return the group name string of a group
+
+Return the group name string of a group
+@param [in] gname Group name string
+*/
+void perfgroup_returnGroupName(char* gname) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Set the short information string of a group
+
+Set the short information string of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] shortInfo String with short information
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setShortInfo(GroupInfo* ginfo, char* shortInfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short information string of a group
+
+Get the short information string of a group
+@param [in] ginfo GroupInfo struct
+@return String with short information or NULL
+*/
+char* perfgroup_getShortInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Return the short information string of a group
+
+Return the short information string of a group
+@param [in] sinfo Short information string
+*/
+void perfgroup_returnShortInfo(char* sinfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the long information string of a group
+
+Set the long information string of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] longInfo String with long information
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setLongInfo(GroupInfo* ginfo, char* longInfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long information string of a group
+
+Get the long information string of a group
+@param [in] ginfo GroupInfo struct
+@return String with long information or NULL
+*/
+char* perfgroup_getLongInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Return the long information string of a group
+
+Return the long information string of a group
+@param [in] linfo Long information string
+*/
+void perfgroup_returnLongInfo(char* linfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Merge two groups
+
+Merge two groups (group2 into group1).
+@param [in,out] grp1 Group1
+@param [in] grp2 Group2
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_mergeGroups(GroupInfo* grp1, GroupInfo* grp2) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read group from file
+
+Read group from file
+@param [in] grouppath Base path to all groups
+@param [in] architecture Architecture string (e.g. short_name in cpuid_info)
+@param [in] groupname Group name
+@param [in,out] ginfo Group filled with data from file
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_readGroup(const char* grouppath, const char* architecture, const char* groupname, GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Create group from event string
+
+Create group from event string (list of event:counter(:opts)).
+@param [in] eventStr event string
+@param [in,out] ginfo Group filled with data from event string
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_customGroup(const char* eventStr, GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Return group
+
+Return group (frees internal lists)
+@param [in] ginfo Performance group info
+*/
+void perfgroup_returnGroup(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get all groups available in the system (base + user home)
+
+Get all groups available in the system (base + user home)
+@param [in] grouppath Base path to all groups
+@param [in] architecture Architecture string (e.g. short_name in cpuid_info)
+@param [out] groupnames List of group names
+@param [out] groupshort List of groups' short information string
+@param [out] grouplong List of groups' long information string
+@return number of groups, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_getGroups( const char* grouppath, const char* architecture, char*** groupnames, char*** groupshort, char*** grouplong) __attribute__ ((visibility ("default") ));
+/*! \brief Return list of all groups
+
+Return list of all groups
+@param [in] groups Number of groups
+@param [in] groupnames List of group names
+@param [in] groupshort List of groups' short information string
+@param [in] grouplong List of groups' long information string
+*/
+void perfgroup_returnGroups(int groups, char** groupnames, char** groupshort, char** grouplong) __attribute__ ((visibility ("default") ));
+
+
+
+
 /** @}*/
 
 /*
@@ -1471,6 +1628,12 @@ extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute_
 /** \addtogroup CpuFreq Retrieval and manipulation of processor clock frequencies
  *  @{
  */
+/*! \brief Initialize cpu frequency module
+
+Initialize cpu frequency module
+@return 0 if successful and 1 if the access mode is invalid
+*/
+extern int freq_init(void) __attribute__ ((visibility ("default") ));
 /*! \brief Get the current clock frequency of a core
 
 Get the current clock frequency of a core
@@ -1486,6 +1649,13 @@ Get the maximal clock frequency of a core
 @return Frequency or 0 in case of errors
 */
 extern uint64_t freq_getCpuClockMax(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Get the maximal available clock frequency of a core
+
+Get the maximal available clock frequency of a core
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMax(const int cpu_id) __attribute__ ((visibility ("default") ));
 /*! \brief Set the maximal clock frequency of a core
 
 Set the maximal clock frequency of a core
@@ -1501,6 +1671,13 @@ Get the minimal clock frequency of a core
 @return Frequency or 0 in case of errors
 */
 extern uint64_t freq_getCpuClockMin(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Get the minimal available clock frequency of a core
+
+Get the minimal available clock frequency of a core
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMin(const int cpu_id) __attribute__ ((visibility ("default") ));
 /*! \brief Set the minimal clock frequency of a core
 
 Set the minimal clock frequency of a core
@@ -1594,6 +1771,11 @@ Get the current Uncore frequency.
 @return frequency in MHz or 0 at failure
 */
 extern uint64_t freq_getUncoreFreqCur(const int socket_id) __attribute__ ((visibility ("default") ));
+/*! \brief Finalize cpu frequency module
+
+Finalize cpu frequency module
+*/
+extern void freq_finalize(void) __attribute__ ((visibility ("default") ));
 /** @}*/
 
 
diff --git a/src/includes/lock.h b/src/includes/lock.h
index e0104d5b1..cf33d95c6 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -91,7 +91,7 @@ lock_check(void)
         }
     }
 
-    if (lock_handle)
+    if (lock_handle > 0)
     {
         close(lock_handle);
     }
diff --git a/src/includes/numa.h b/src/includes/numa.h
index c379dd930..d6e0bf094 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -46,6 +46,7 @@ extern int str2int(const char* str);
 struct numa_functions {
     int (*numa_init) (void);
     void (*numa_setInterleaved) (const int*, int);
+    void (*numa_setMembind) (const int*, int);
     void (*numa_membind) (void*, size_t, int);
 };
 
diff --git a/src/includes/numa_proc.h b/src/includes/numa_proc.h
index e03ef1efc..c2af9fd00 100644
--- a/src/includes/numa_proc.h
+++ b/src/includes/numa_proc.h
@@ -33,5 +33,6 @@
 extern int proc_numa_init(void);
 extern void proc_numa_membind(void* ptr, size_t size, int domainId);
 extern void proc_numa_setInterleaved(const int* processorList, int numberOfProcessors);
+extern void proc_numa_setMembind(const int* processorList, int numberOfProcessors);
 
 #endif
diff --git a/src/includes/perfgroup.h b/src/includes/perfgroup.h
index 663d54140..be2119cb9 100644
--- a/src/includes/perfgroup.h
+++ b/src/includes/perfgroup.h
@@ -30,25 +30,10 @@
 #ifndef PERFGROUP_H
 #define PERFGROUP_H
 
- /*! \brief The groupInfo data structure describes a performance group
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
 
-Groups can be either be read in from file or be a group with custom event set. For
-performance groups commonly all values are set. For groups with custom event set,
-the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in
-general the nmetrics value is 0.
-*/
-typedef struct {
-    char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */
-    char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */
-    int nevents; /*!< \brief Number of event/counter combinations */
-    char** events; /*!< \brief List of events */
-    char** counters; /*!< \brief List of counter registers */
-    int nmetrics; /*!< \brief Number of metrics */
-    char** metricnames; /*!< \brief Metric names */
-    char** metricformulas; /*!< \brief Metric formulas */
-    char* longinfo; /*!< \brief Descriptive text about the group or empty */
-    char* lua_funcs; /*!< \brief Custom Lua functions used in metric formulas */
-} GroupInfo;
+#include <likwid.h>
 
 typedef enum {
     GROUP_NONE = 0,
@@ -69,26 +54,33 @@ static char* groupFileSectionNames[MAX_GROUP_FILE_SECTIONS] = {
     "LUA"
 };
 
-extern int get_groups(const char* grouppath, const char* architecture, char*** groupnames, char*** groupshort, char*** grouplong);
-extern void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong);
-extern int read_group(const char* grouppath, const char* architecture, const char* groupname, GroupInfo* ginfo);
-extern int custom_group(const char* eventStr, GroupInfo* ginfo);
-extern char* get_eventStr(GroupInfo* ginfo);
-void put_eventStr(char* eventset);
-extern char* get_shortInfo(GroupInfo* ginfo);
-void put_shortInfo(char* sinfo);
-extern char* get_longInfo(GroupInfo* ginfo);
-void put_longInfo(char* linfo);
-extern void return_group(GroupInfo* ginfo);
+typedef struct {
+    int counters; /*!< \brief Number of entries in the list */
+    struct bstrList* cnames; /*!< \brief List of counter names */
+    struct bstrList* cvalues; /*!< \brief List of counter values */
+} CounterList;
+
+//extern int get_groups(const char* grouppath, const char* architecture, char*** groupnames, char*** groupshort, char*** grouplong);
+//extern void return_groups(int groups, char** groupnames, char** groupshort, char** grouplong);
+//extern int read_group(const char* grouppath, const char* architecture, const char* groupname, GroupInfo* ginfo);
+//extern int custom_group(const char* eventStr, GroupInfo* ginfo);
+//extern char* get_eventStr(GroupInfo* ginfo);
+//void put_eventStr(char* eventset);
+//extern char* get_shortInfo(GroupInfo* ginfo);
+//void put_shortInfo(char* sinfo);
+//extern char* get_longInfo(GroupInfo* ginfo);
+//void put_longInfo(char* linfo);
+//extern void return_group(GroupInfo* ginfo);
+
+
+extern void init_clist(CounterList* clist);
+extern int add_to_clist(CounterList* clist, char* counter, double result);
+extern int update_clist(CounterList* clist, char* counter, double result);
+extern void destroy_clist(CounterList* clist);
+
+extern int calc_metric(char* formula, CounterList* clist, double *result);
+
 
 
 
-extern int calc_add_str_def(char* name, char* value, int cpu);
-extern int calc_add_int_def(char* name, int value, int cpu);
-extern int calc_add_dbl_def(char* name, double value, int cpu);
-extern int calc_add_str_var(char* name, char* value, bstring vars, bstring varlist);
-extern int calc_add_dbl_var(char* name, double value, bstring vars, bstring varlist);
-extern int calc_add_int_var(char* name, int value, bstring vars, bstring varlist);
-extern int calc_set_user_funcs(char* s);
-extern int calc_metric(int cpu, char* formula, bstring varstr, bstring varlist, double *result);
 #endif /* PERFGROUP_H */
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index 101d73082..e684372c8 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -70,8 +70,8 @@ UMASK_BUS_HIT_DRV_THIS_A                              0x00
 UMASK_BUS_HIT_DRV_ALL_A                               0x20
 
 EVENT_BUS_HITM_DRV                                    0x7B PMC
-UMASK_BUS_HITM_DRV                                    0x00
-UMASK_BUS_HITM_DRV                                    0x20
+UMASK_BUS_HITM_DRV_THIS_A                             0x00
+UMASK_BUS_HITM_DRV_ALL_A                              0x20
 
 EVENT_BUS_IO_WAIT                                     0x7F PMC
 UMASK_BUS_IO_WAIT_ALL_CORES                           0xC0
@@ -174,55 +174,55 @@ EVENT_CYCLES_DIV_BUSY            0x14   PMC0
 UMASK_CYCLES_DIV_BUSY            0x00
 
 EVENT_CYCLES_INT_MASKED                               0xC6 PMC
-UMASK_CYCLES_INT_MASKED_CYCLES_INT_MASKED             0x01 
-UMASK_CYCLES_INT_MASKED_CYCLES_INT_PENDING_AND_MASKED 0x02  
+UMASK_CYCLES_INT_MASKED_CYCLES_INT_MASKED             0x01
+UMASK_CYCLES_INT_MASKED_CYCLES_INT_PENDING_AND_MASKED 0x02
 
 EVENT_DATA_TLB_MISSES                                 0x08 PMC
-UMASK_DATA_TLB_MISSES_DTLB_MISS                       0x07 
-UMASK_DATA_TLB_MISSES.DTLB_MISS_LD                    0x05 
-UMASK_DATA_TLB_MISSES.DTLB_MISS_ST                    0x06 
-UMASK_DATA_TLB_MISSES.L0_DTLB_MISS_LD                 0x09 
+UMASK_DATA_TLB_MISSES_DTLB_MISS                       0x07
+UMASK_DATA_TLB_MISSES.DTLB_MISS_LD                    0x05
+UMASK_DATA_TLB_MISSES.DTLB_MISS_ST                    0x06
+UMASK_DATA_TLB_MISSES.L0_DTLB_MISS_LD                 0x09
 
 EVENT_DIV                                             0x13 PMC
-UMASK_DIV_AR                                          0x81 
-UMASK_DIV_S                                           0x01 
+UMASK_DIV_AR                                          0x81
+UMASK_DIV_S                                           0x01
 
 EVENT_EIST_TRANS               0x3A  PMC
 UMASK_EIST_TRANS               0x00
 
 EVENT_EXT_SNOOP                                       0x77 PMC
-UMASK_EXT_SNOOP_ALL_CORES_MODIFIED                    0xC8  
-UMASK_EXT_SNOOP_ALL_CORES_EXCLUSIVE                   0xC4  
-UMASK_EXT_SNOOP_ALL_CORES_SHARED                      0xC2  
-UMASK_EXT_SNOOP_ALL_CORES_INVALID                     0xC1  
-UMASK_EXT_SNOOP_ALL_CORES_MESI                        0xCF  
-UMASK_EXT_SNOOP_THIS_CORE_MODIFIED                    0x48  
-UMASK_EXT_SNOOP_THIS_CORE_EXCLUSIVE                   0x44  
-UMASK_EXT_SNOOP_THIS_CORE_SHARED                      0x42  
-UMASK_EXT_SNOOP_THIS_CORE_INVALID                     0x41  
-UMASK_EXT_SNOOP_THIS_CORE_MESI                        0x4F  
+UMASK_EXT_SNOOP_ALL_CORES_MODIFIED                    0xC8
+UMASK_EXT_SNOOP_ALL_CORES_EXCLUSIVE                   0xC4
+UMASK_EXT_SNOOP_ALL_CORES_SHARED                      0xC2
+UMASK_EXT_SNOOP_ALL_CORES_INVALID                     0xC1
+UMASK_EXT_SNOOP_ALL_CORES_MESI                        0xCF
+UMASK_EXT_SNOOP_THIS_CORE_MODIFIED                    0x48
+UMASK_EXT_SNOOP_THIS_CORE_EXCLUSIVE                   0x44
+UMASK_EXT_SNOOP_THIS_CORE_SHARED                      0x42
+UMASK_EXT_SNOOP_THIS_CORE_INVALID                     0x41
+UMASK_EXT_SNOOP_THIS_CORE_MESI                        0x4F
 
 EVENT_FP_ASSIST                                       0x11 PMC
-UMASK_FP_ASSIST                                       0x01 
-UMASK_FP_ASSIST_AR                                    0x81 
+UMASK_FP_ASSIST                                       0x01
+UMASK_FP_ASSIST_AR                                    0x81
 
 EVENT_HW_INT_RCV             0xC8   PMC
 UMASK_HW_INT_RCV    0x00
 
 EVENT_ICACHE                                         0x80  PMC
-UMASK_ICACHE_ACCESSES                                0x03 
-UMASK_ICACHE_MISSES                                  0x02 
+UMASK_ICACHE_ACCESSES                                0x03
+UMASK_ICACHE_MISSES                                  0x02
 
 EVENT_INST_RETIRED               0xC0   PMC
 UMASK_INST_RETIRED_ANY_P         0x00
 
 EVENT_ITLB                                            0x82 PMC
-UMASK_ITLB_MISSES                                     0x02 
-UMASK_ITLB_FLUSH                                      0x04 
+UMASK_ITLB_MISSES                                     0x02
+UMASK_ITLB_FLUSH                                      0x04
 
 EVENT_L1D_CACHE                                       0x40 PMC
 UMASK_L1D_CACHE_LD                                    0x21
-UMASK_L1D_CACHE_ST                                    0x22 
+UMASK_L1D_CACHE_ST                                    0x22
 
 EVENT_L2_ADS                     0x21   PMC
 UMASK_L2_ADS_ALL_CORES           0xC0
@@ -397,18 +397,18 @@ UMASK_L2_ST_THIS_CORE_INVALID    0x41
 UMASK_L2_ST_THIS_CORE_MESI       0x4F
 
 EVENT_MACHINE_CLEARS                                  0xC3 PMC
-UMASK_MACHINE_CLEARS_SMC                              0x01 
+UMASK_MACHINE_CLEARS_SMC                              0x01
 
 EVENT_MACRO_INSTS          0xAA  PMC
 UMASK_MACRO_INSTS_DECODED          0x01
 UMASK_MACRO_INSTS_CISC_DECODED     0x08
 
 EVENT_MEM_LOAD_RETIRED           0xCB    PMC0
-UMASK_MEM_LOAD_RETIRED_L1D_MISS       0x01 
-UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS  0x02 
-UMASK_MEM_LOAD_RETIRED_L2_MISS        0x04 
-UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS   0x08 
-UMASK_MEM_LOAD_RETIRED_DTLB_MISS      0x10 
+UMASK_MEM_LOAD_RETIRED_L1D_MISS       0x01
+UMASK_MEM_LOAD_RETIRED_L1D_LINE_MISS  0x02
+UMASK_MEM_LOAD_RETIRED_L2_MISS        0x04
+UMASK_MEM_LOAD_RETIRED_L2_LINE_MISS   0x08
+UMASK_MEM_LOAD_RETIRED_DTLB_MISS      0x10
 
 EVENT_MUL                        0x12   PMC1
 UMASK_MUL                        0x00
@@ -420,9 +420,9 @@ UMASK_PAGE_WALKS_WALKS            0x03
 UMASK_PAGE_WALKS_CYCLES           0x02
 
 EVENT_PREFETCH                                        0x07 PMC
-UMASK_PREFETCH_PREFETCHNTA                            0x08 
-UMASK_PREFETCH_PREFETCHT0                             0x01 
-UMASK_PREFETCH_SW_L2                                  0x06 
+UMASK_PREFETCH_PREFETCHNTA                            0x08
+UMASK_PREFETCH_PREFETCHT0                             0x01
+UMASK_PREFETCH_SW_L2                                  0x06
 
 EVENT_SEGMENT_REG_LOADS          0x06   PMC
 UMASK_SEGMENT_REG_LOADS_ANY          0x00
@@ -438,7 +438,7 @@ UMASK_SIMD_COMP_INST_RETIRED_PACKED_SINGLE     0x01
 UMASK_SIMD_COMP_INST_RETIRED_SCALAR_SINGLE     0x02
 UMASK_SIMD_COMP_INST_RETIRED_PACKED_DOUBLE     0x04
 UMASK_SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE     0x08
-UMASK_SIMD_COMP_INST_RETIRED_ANY               0x1F 
+UMASK_SIMD_COMP_INST_RETIRED_ANY               0x1F
 
 EVENT_SIMD_INST_RETIRED          0xC7   PMC
 UMASK_SIMD_INST_RETIRED_PACKED_SINGLE       0x01
@@ -453,25 +453,25 @@ UMASK_SIMD_SAT_INSTR_RETIRED    0x00
 
 EVENT_SIMD_SAT_UOPS_EXEC            0xB1  PMC
 UMASK_SIMD_SAT_UOPS_EXEC_S        0x00
-UMASK_SIMD_SAT_UOP_EXEC_AR                            0x80 
+UMASK_SIMD_SAT_UOP_EXEC_AR                            0x80
 
 EVENT_SIMD_UOPS_EXEC            0xB0  PMC
 UMASK_SIMD_UOPS_EXEC_S          0x00
-UMASK_SIMD_UOPS_EXEC_AR                               0xB0 0x80 
+UMASK_SIMD_UOPS_EXEC_AR                               0xB0 0x80
 
 EVENT_SIMD_UOP_TYPE_EXEC               0xB3  PMC
-UMASK_SIMD_UOP_TYPE_EXEC_ARITHMETIC_AR                0xA0 
-UMASK_SIMD_UOP_TYPE_EXEC_ARITHMETIC_S                 0x20 
-UMASK_SIMD_UOP_TYPE_EXEC_LOGICAL_AR                   0x90 
-UMASK_SIMD_UOP_TYPE_EXEC_LOGICAL_S                    0x10 
-UMASK_SIMD_UOP_TYPE_EXEC_MUL_AR                       0x81 
-UMASK_SIMD_UOP_TYPE_EXEC_MUL_S                        0x01 
-UMASK_SIMD_UOP_TYPE_EXEC_PACK_AR                      0x84 
-UMASK_SIMD_UOP_TYPE_EXEC_PACK_S                       0x04 
-UMASK_SIMD_UOP_TYPE_EXEC_SHIFT_AR                     0x82 
-UMASK_SIMD_UOP_TYPE_EXEC_SHIFT_SSIMD                  0x02 
-UMASK_SIMD_UOP_TYPE_EXEC_UNPACK_AR                    0x88 
-UMASK_SIMD_UOP_TYPE_EXEC_UNPACK_S                     0x08 
+UMASK_SIMD_UOP_TYPE_EXEC_ARITHMETIC_AR                0xA0
+UMASK_SIMD_UOP_TYPE_EXEC_ARITHMETIC_S                 0x20
+UMASK_SIMD_UOP_TYPE_EXEC_LOGICAL_AR                   0x90
+UMASK_SIMD_UOP_TYPE_EXEC_LOGICAL_S                    0x10
+UMASK_SIMD_UOP_TYPE_EXEC_MUL_AR                       0x81
+UMASK_SIMD_UOP_TYPE_EXEC_MUL_S                        0x01
+UMASK_SIMD_UOP_TYPE_EXEC_PACK_AR                      0x84
+UMASK_SIMD_UOP_TYPE_EXEC_PACK_S                       0x04
+UMASK_SIMD_UOP_TYPE_EXEC_SHIFT_AR                     0x82
+UMASK_SIMD_UOP_TYPE_EXEC_SHIFT_SSIMD                  0x02
+UMASK_SIMD_UOP_TYPE_EXEC_UNPACK_AR                    0x88
+UMASK_SIMD_UOP_TYPE_EXEC_UNPACK_S                     0x08
 
 EVENT_SNOOP_STALL_DRV                                 0x7E PMC
 UMASK_SNOOP_STALL_DRV_ALL_CORES_THIS_A    0xC0
@@ -480,7 +480,7 @@ UMASK_SNOOP_STALL_DRV_THIS_CORE_THIS_A    0x40
 UMASK_SNOOP_STALL_DRV_THIS_CORE_ALL_A     0x60
 
 EVENT_STORE_FORWARDS_GOOD                             0x02  PMC
-UMASK_STORE_FORWARDS_GOOD 0x81 
+UMASK_STORE_FORWARDS_GOOD 0x81
 
 EVENT_THERMAL_TRIP               0x3B  PMC
 UMASK_THERMAL_TRIP               0xC0
@@ -489,5 +489,5 @@ EVENT_UOPS_RETIRED               0xC2   PMC
 UMASK_UOPS_RETIRED_ANY           0x00
 
 EVENT_X87_COMP_OPS_EXE                                0x10 PMC
-UMASK_X87_COMP_OPS_EXE_ANY_S                          0x01 
-UMASK_X87_COMP_OPS_EXE_ANY_AR                         0x81 
+UMASK_X87_COMP_OPS_EXE_ANY_S                          0x01
+UMASK_X87_COMP_OPS_EXE_ANY_AR                         0x81
diff --git a/src/includes/perfmon_broadwellEP_events.txt b/src/includes/perfmon_broadwellEP_events.txt
index 90a521cc4..c02aed121 100644
--- a/src/includes/perfmon_broadwellEP_events.txt
+++ b/src/includes/perfmon_broadwellEP_events.txt
@@ -225,7 +225,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -546,22 +546,18 @@ UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES       EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES      EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES      EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL          EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES  EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
@@ -766,7 +762,7 @@ UMASK_TXR_INSERTS_BL_CORE           0x40
 EVENT_TXR_ADS_USED                  0x04 CBOX
 UMASK_TXR_ADS_USED_AD               0x01
 UMASK_TXR_ADS_USED_AK               0x02
-UMASK_TXR_ADS_USED_BL               0x04 
+UMASK_TXR_ADS_USED_BL               0x04
 
 EVENT_RING_BOUNCES                  0x05 CBOX
 UMASK_RING_BOUNCES_AD               0x01
@@ -1403,14 +1399,24 @@ UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
-UMASK_RPQ_CYCLES_NO_REG_CREDITS_ALL     0x0F
+
+EVENT_RPQ_CYCLES_NO_SPEC_CREDITS         0x16 BBOX
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
 
 EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_ALL     0x0F
+
+EVENT_WPQ_CYCLES_NO_SPEC_CREDITS         0x19 BBOX
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
 
 EVENT_SBO0_CREDITS_ACQUIRED             0x68 BBOX
 UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
@@ -1464,7 +1470,7 @@ EVENT_STALL_NO_SBO_CREDIT               0x6C BBOX
 UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
 UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
 UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
-UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x08
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
 
 EVENT_TAD_REQUESTS_G0                   0x1B BBOX
 UMASK_TAD_REQUESTS_G0_REGION0           0x01
@@ -2118,7 +2124,7 @@ EVENT_STALL_NO_SBO_CREDIT               0x2C PBOX0|PBOX1
 UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
 UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
 UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
-UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x08
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
 
 EVENT_CACHE_TOTAL_OCCUPANCY             0x12 IBOX
 UMASK_CACHE_TOTAL_OCCUPANCY_ANY         0x01
diff --git a/src/includes/perfmon_broadwell_events.txt b/src/includes/perfmon_broadwell_events.txt
index 16a05f22a..731da24b4 100644
--- a/src/includes/perfmon_broadwell_events.txt
+++ b/src/includes/perfmon_broadwell_events.txt
@@ -223,7 +223,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -544,22 +544,18 @@ UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES       EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES      EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES      EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL          EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES  EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
diff --git a/src/includes/perfmon_broadwelld_events.txt b/src/includes/perfmon_broadwelld_events.txt
index 0d8954aea..b5d1508cc 100644
--- a/src/includes/perfmon_broadwelld_events.txt
+++ b/src/includes/perfmon_broadwelld_events.txt
@@ -225,7 +225,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -545,22 +545,18 @@ UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
+DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES       EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES      EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
+DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES      EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL          EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
+DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES  EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_STALL_CYCLES     0x01
@@ -1338,11 +1334,23 @@ UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
 
+EVENT_RPQ_CYCLES_NO_SPEC_CREDITS         0x16 BBOX
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
+
 EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x02
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x04
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x08
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_SPEC_CREDITS         0x19 BBOX
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
 
 EVENT_SNOOPS_RSP_AFTER_DATA             0x0A BBOX
 UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL       0x01
@@ -2112,5 +2120,3 @@ UMASK_TXR_DATA_INSERTS_NCS              0x00
 
 EVENT_TXR_REQUEST_OCCUPANCY             0x0D IBOX
 UMASK_TXR_REQUEST_OCCUPANCY             0x00
-
-
diff --git a/src/includes/perfmon_cascadelakeX_events.txt b/src/includes/perfmon_cascadelakeX_events.txt
index ecdfebc96..21a5a54ab 100644
--- a/src/includes/perfmon_cascadelakeX_events.txt
+++ b/src/includes/perfmon_cascadelakeX_events.txt
@@ -205,10 +205,6 @@ DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTIO
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_USED_CYCLES      0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1
@@ -298,7 +294,7 @@ UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_DRAM  0x02
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_HITM  0x04
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_FWD   0x08
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_ALL   0x0E
-UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_REMOTE_PMM 0x10
+UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_PMM 0x10
 
 EVENT_MEM_LOAD_MISC_RETIRED             0xD4 PMC
 UMASK_MEM_LOAD_MISC_RETIRED_UC          0x04
@@ -549,11 +545,6 @@ UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD 0x10
 DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 0x10
 
-EVENT_LOCK_CYCLES_CACHE_LOCK            0x63 PMC
-UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION   0x02
-DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
-UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT      0x02
-
 EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0xB2 PMC
 UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0x01
 
@@ -1319,6 +1310,53 @@ UMASK_CAS_COUNT_RD_RMM              0x20
 UMASK_CAS_COUNT_RD_ISOCH            0x40
 UMASK_CAS_COUNT_WR_ISOCH            0x80
 
+EVENT_PMM_RPQ_OCCUPANCY             0xE0 MBOX
+UMASK_PMM_RPQ_OCCUPANCY_ALL         0x01
+
+EVENT_PMM_RPQ_INSERTS               0xE3 MBOX
+UMASK_PMM_RPQ_INSERTS               0x00
+
+EVENT_PMM_CMD1                      0xEA MBOX
+UMASK_PMM_CMD1_ALL                  0x01
+UMASK_PMM_CMD1_RD                   0x02
+UMASK_PMM_CMD1_WR                   0x04
+UMASK_PMM_CMD1_UFILL_RD             0x08
+
+EVENT_PMM_WPQ_OCCUPANCY             0xE4 MBOX
+UMASK_PMM_WPQ_OCCUPANCY_ALL         0x01
+
+EVENT_PMM_WPQ_INSERTS               0xE7 MBOX
+UMASK_PMM_WPQ_INSERTS               0x00
+
+EVENT_POWER_CHANNEL_PPD             0x85 MBOX
+UMASK_POWER_CHANNEL_PPD             0x00
+
+EVENT_POWER_SELF_REFRESH            0x43 MBOX
+UMASK_POWER_SELF_REFRESH            0x00
+
+EVENT_PRE_COUNT                     0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS           0x01
+UMASK_PRE_COUNT_RD                  0x04
+
+EVENT_RPQ_INSERTS                   0x10 MBOX
+UMASK_RPQ_INSERTS                   0x00
+
+EVENT_RPQ_OCCUPANCY                 0x80 MBOX
+UMASK_RPQ_OCCUPANCY                 0x00
+
+EVENT_M_TAGCHK                      0xD3 MBOX
+UMASK_M_TAGCHK_HIT                  0x01
+UMASK_M_TAGCHK_MISS_CLEAN           0x02
+UMASK_M_TAGCHK_MISS_DIRTY           0x04
+
+EVENT_WPQ_INSERTS                   0x20 MBOX
+UMASK_WPQ_INSERTS                   0x00
+
+EVENT_WPQ_OCCUPANCY                 0x81 MBOX
+UMASK_WPQ_OCCUPANCY                 0x00
+
+
+
 EVENT_DRAM_CLOCKTICKS               0x00 MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX|MBOX4FIX|MBOX5FIX
 UMASK_DRAM_CLOCKTICKS               0x00
 
@@ -1500,6 +1538,7 @@ UMASK_IMC_READS_NORMAL                  0x01
 UMASK_IMC_READS_ISOCH                   0x02
 UMASK_IMC_READS_ALL                     0x04
 UMASK_IMC_READS_TO_DDRT                 0x08
+UMASK_IMC_READS_TO_PMM                  0x08
 UMASK_IMC_READS_FROM_TRANSGRESS         0x10
 
 EVENT_IMC_WRITES                         0x38 M2M
@@ -1509,6 +1548,7 @@ UMASK_IMC_WRITES_FULL_ISOCH              0x04
 UMASK_IMC_WRITES_PARTIAL_ISOCH           0x08
 UMASK_IMC_WRITES_ALL                     0x10
 UMASK_IMC_WRITES_TO_DDRT                 0x20
+UMASK_IMC_WRITES_TO_PMM                  0x20
 UMASK_IMC_WRITES_FROM_TRANSGRESS         0x40
 UMASK_IMC_WRITES_NI                      0x80
 
@@ -2662,7 +2702,7 @@ UMASK_DATA_REQ_BY_CPU_CFG_READ_PART2     0x40 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_CFG_READ_PART3     0x40 0x03 0x08
 UMASK_DATA_REQ_BY_CPU_CFG_READ_VTD0      0x40 0x03 0x10
 UMASK_DATA_REQ_BY_CPU_CFG_READ_VTD1      0x40 0x03 0x20
-UMASK_DATA_REQ_BY_CPU_CFG_READ_PART0     0x40 0x03 0x01
+UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART0    0x10 0x03 0x01
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART1    0x10 0x03 0x02
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART2    0x10 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART3    0x10 0x03 0x08
@@ -2674,7 +2714,7 @@ UMASK_DATA_REQ_BY_CPU_IO_READ_PART2      0x80 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_IO_READ_PART3      0x80 0x03 0x08
 UMASK_DATA_REQ_BY_CPU_IO_READ_VTD0       0x80 0x03 0x10
 UMASK_DATA_REQ_BY_CPU_IO_READ_VTD1       0x80 0x03 0x20
-UMASK_DATA_REQ_BY_CPU_IO_READ_PART0      0x80 0x03 0x01
+UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART0      0x20 0x03 0x01
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART1     0x20 0x03 0x02
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART2     0x20 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART3     0x20 0x03 0x08
@@ -2789,7 +2829,7 @@ UMASK_TXN_REQ_BY_CPU_CFG_READ_PART2     0x40 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_CFG_READ_PART3     0x40 0x03 0x08
 UMASK_TXN_REQ_BY_CPU_CFG_READ_VTD0      0x40 0x03 0x10
 UMASK_TXN_REQ_BY_CPU_CFG_READ_VTD1      0x40 0x03 0x20
-UMASK_TXN_REQ_BY_CPU_CFG_READ_PART0     0x40 0x03 0x01
+UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART0    0x10 0x03 0x01
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART1    0x10 0x03 0x02
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART2    0x10 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART3    0x10 0x03 0x08
@@ -2801,7 +2841,7 @@ UMASK_TXN_REQ_BY_CPU_IO_READ_PART2      0x80 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_IO_READ_PART3      0x80 0x03 0x08
 UMASK_TXN_REQ_BY_CPU_IO_READ_VTD0       0x80 0x03 0x10
 UMASK_TXN_REQ_BY_CPU_IO_READ_VTD1       0x80 0x03 0x20
-UMASK_TXN_REQ_BY_CPU_IO_READ_PART0      0x80 0x03 0x01
+UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART0     0x20 0x03 0x01
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART1     0x20 0x03 0x02
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART2     0x20 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART3     0x20 0x03 0x08
diff --git a/src/includes/perfmon_haswellEP_events.txt b/src/includes/perfmon_haswellEP_events.txt
index 014a212aa..0cc60f705 100644
--- a/src/includes/perfmon_haswellEP_events.txt
+++ b/src/includes/perfmon_haswellEP_events.txt
@@ -253,7 +253,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -629,7 +629,9 @@ EVENT_BR_INST_RETIRED               0xC4  PMC
 UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
 UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
 UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+# This event is only usable with Intel PEBS. Intel PEBS is not supported by
+# LIKWID.
+#UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
 UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
 UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
 UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
@@ -1489,11 +1491,23 @@ UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
 UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
 
+EVENT_RPQ_CYCLES_NO_SPEC_CREDITS         0x16 BBOX
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
+
 EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
 UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x02
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x04
-UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x08
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_SPEC_CREDITS         0x19 BBOX
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_SPEC_CREDITS_CHN3    0x08
 
 EVENT_SBO0_CREDITS_ACQUIRED             0x68 BBOX
 UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index d835dcbae..17b8bf88e 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -254,7 +254,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -630,7 +630,9 @@ EVENT_BR_INST_RETIRED               0xC4  PMC
 UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
 UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
 UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+# This event is only usable with Intel PEBS. Intel PEBS is not supported by
+# LIKWID.
+#UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
 UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
 UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
 UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 6d2868713..0ac0522e1 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -565,7 +565,7 @@ UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_2          0x41
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_2      0x42
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_2   0x44
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_2         0x48
-UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_3           0x4F
+UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_2           0x4F
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_3          0x81
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_3      0x82
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_3   0x84
@@ -843,21 +843,3 @@ UMASK_UNC_L3_NON_CANCEL_READ_REQUESTS_RDBLKM_ALL             0xF4
 EVENT_UNC_L3_LATENCY                        0x4EF     UPMC
 UMASK_UNC_L3_LATENCY_CYCLE_COUNT            0x01
 UMASK_UNC_L3_LATENCY_REQUEST_COUNT          0x02
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/includes/perfmon_ivybridgeEP_events.txt b/src/includes/perfmon_ivybridgeEP_events.txt
index 1f4d523d0..6d347415e 100644
--- a/src/includes/perfmon_ivybridgeEP_events.txt
+++ b/src/includes/perfmon_ivybridgeEP_events.txt
@@ -207,7 +207,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -470,11 +470,7 @@ UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x20
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index 7f1f44ffc..7b2be2e82 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -208,7 +208,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -471,11 +471,7 @@ UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x20
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index ad151995d..7a189bb49 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -163,12 +163,12 @@ EVENT_GLOBAL_TLB_FLUSH           0x54   PMC
 UMASK_GLOBAL_TLB_FLUSH           0x00
 
 EVENT_GLOBAL_READ_BLOCK_OPS           0x62   PMC
-UMASK_GLOBAL_READ_BLOCK_OPS_READ      0x01   
-UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_MOD      0x02   
-UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_SHARED      0x04   
-UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC      0x08   
-UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC_MOD      0x10   
-UMASK_GLOBAL_READ_BLOCK_OPS_SPEC_SHARED      0x20   
+UMASK_GLOBAL_READ_BLOCK_OPS_READ      0x01
+UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_MOD      0x02
+UMASK_GLOBAL_READ_BLOCK_OPS_RDBLK_SHARED      0x04
+UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC      0x08
+UMASK_GLOBAL_READ_BLOCK_OPS_RDBLKSPEC_MOD      0x10
+UMASK_GLOBAL_READ_BLOCK_OPS_SPEC_SHARED      0x20
 
 EVENT_MEMORY_REQUESTS        0x65     PMC
 UMASK_MEMORY_REQUESTS_UC        0x01
@@ -181,10 +181,10 @@ UMASK_DATA_PREFETCHER_HIT_MAB         0x08
 
 #FIXME - Do we need to update mask details
 EVENT_MAB_REQ        0x68     PMC
-UMASK_MAB_REQ        0x00 
+UMASK_MAB_REQ        0x00
 
 EVENT_MAB_WAIT        0x69     PMC
-UMASK_MAB_WAIT        0x00 
+UMASK_MAB_WAIT        0x00
 
 EVENT_RESPONSE_ON_CACHE_REFILLS        0x6C     PMC
 UMASK_RESPONSE_ON_CACHE_REFILLS_EXCLUSIVE           0x01
@@ -465,7 +465,7 @@ UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_2          0x41
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_2      0x42
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_2   0x44
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_DIRTY_TO_2         0x48
-UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_3           0x4F
+UMASK_UNC_CPU_READ_CMD_LATENCY_A_ALL_TO_2           0x4F
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_TO_3          0x81
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_MOD_TO_3      0x82
 UMASK_UNC_CPU_READ_CMD_LATENCY_A_READ_SHARED_TO_3   0x84
@@ -651,5 +651,3 @@ UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT0_WRITE_TO_READ  0x04
 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_DIMM           0x08
 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_READ_TO_WRITE  0x10
 UMASK_MEMORY_CONTROLLER_TURNAROUNDS_DCT1_WRITE_TO_READ  0x20
-
-
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 132b10d01..02dfe25a4 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -83,10 +83,10 @@ UMASK_UOPS_ISSUED_ANY            0x01
 UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
-UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_CACHE_LOCAL_HOME_HIT   0x08 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                   0x10 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                    0x20 
+UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02
+UMASK_MEM_UNCORE_RETIRED_REMOTE_CACHE_LOCAL_HOME_HIT   0x08
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                   0x10
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                    0x20
 
 EVENT_FP_COMP_OPS_EXE            0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87        0x01
@@ -193,12 +193,12 @@ EVENT_L3_LAT_CACHE               0x2E   PMC
 UMASK_L3_LAT_CACHE_REFERENCE     0x4F
 UMASK_L3_LAT_CACHE_MISS          0x41
 
-EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
 UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
 UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
 
-EVENT_UOPS_DECODED                 0x3D
-UMASK_UOPS_DECODED_DEC0              0x01
+EVENT_UOPS_DECODED                0x3D PMC
+UMASK_UOPS_DECODED_DEC0           0x01
 
 EVENT_L1D_CACHE_LD               0x40   PMC0|PMC1
 UMASK_L1D_CACHE_LD_I_STATE       0x01
@@ -314,10 +314,10 @@ UMASK_BR_INST_EXEC_INDIRECT_NON_CALL     0x04
 UMASK_BR_INST_EXEC_NON_CALLS             0x07
 UMASK_BR_INST_EXEC_RETURN_NEAR           0x08
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL      0x10
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20 
-UMASK_BR_INST_EXEC_NEAR_CALLS            0x30 
-UMASK_BR_INST_EXEC_TAKEN                 0x40 
-UMASK_BR_INST_EXEC_ANY                   0x7F 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20
+UMASK_BR_INST_EXEC_NEAR_CALLS            0x30
+UMASK_BR_INST_EXEC_TAKEN                 0x40
+UMASK_BR_INST_EXEC_ANY                   0x7F
 
 EVENT_BR_MISP_EXEC                    0x89   PMC
 UMASK_BR_MISP_EXEC_COND               0x01
@@ -393,7 +393,9 @@ EVENT_BR_INST_RETIRED               0xC4  PMC
 UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
 UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
 UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+# This event is only usable with Intel PEBS. Intel PEBS is not supported by
+# LIKWID.
+#UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
 
 EVENT_BR_MISP_RETIRED               0xC5  PMC
 UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
@@ -468,9 +470,9 @@ UMASK_L2_TRANSACTIONS_RFO           0x02
 UMASK_L2_TRANSACTIONS_IFETCH        0x04
 UMASK_L2_TRANSACTIONS_PREFETCH      0x08
 UMASK_L2_TRANSACTIONS_L1D_WB        0x10
-UMASK_L2_TRANSACTIONS_L1D_FILL      0x20
-UMASK_L2_TRANSACTIONS_L1D_WB        0x40
-UMASK_L2_TRANSACTIONS_L1D_ANY       0x80
+UMASK_L2_TRANSACTIONS_FILL          0x20
+UMASK_L2_TRANSACTIONS_WB            0x40
+UMASK_L2_TRANSACTIONS_ANY           0x80
 
 EVENT_L2_LINES_IN                   0xF1   PMC
 UMASK_L2_LINES_IN_S_STATE           0x02
@@ -670,7 +672,6 @@ UMASK_DRAM_CMD_SOFT_RST                     0x17 0x02 0x00
 UMASK_DRAM_CMD_WR_CFG                       0x1C 0x02 0x00
 UMASK_DRAM_CMD_RD_CFG                       0x1D 0x02 0x00
 UMASK_DRAM_CMD_ZQCAL                        0x1E 0x02 0x00
-UMASK_DRAM_CMD_ALL                          0x00 0x02 0x00
 UMASK_DRAM_CMD_ALL_TRDOFF                   0x00 0x02 0x00
 UMASK_DRAM_CMD_ALL_RDPRIO                   0x00 0x02 0x01
 UMASK_DRAM_CMD_ALL_WRPRIO                   0x00 0x02 0x02
@@ -803,9 +804,6 @@ UMASK_PGT_PAGE_EV_CLS2OPN             0x01 0x0B 0x00
 EVENT_RETRIES                         0x0B  MBOX
 UMASK_RETRIES_ALL                     0x00 0x00 0x00
 
-EVENT_REFRESH                         0x06  MBOX
-UMASK_REFRESH                         0x00 0x00 0x00
-
 EVENT_REFRESH_CONFLICT                0x07  MBOX
 UMASK_REFRESH_CONFLICT                0x00 0x00 0x00
 
@@ -3049,21 +3047,6 @@ UMASK_U2R_REQUEST_CYCLES                         0x00
 EVENT_WOKEN                                      0xF8  UBOX
 UMASK_WOKEN                                      0x00
 
-EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_RBOX          0x01
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_BBOX          0x02
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_FULL_ALL           0x03
-
-EVENT_TO_R_B_HOM_MSGQ_CYCLES_NE                 0x06 SBOX
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_RBOX            0x01
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_BBOX            0x02
-UMASK_TO_R_B_HOM_MSGQ_CYCLES_NE_ALL             0x03
-
-EVENT_TO_R_B_HOM_MSGQ_OCCUPANCY                 0x07 SBOX
-UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_RBOX            0x01
-UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_BBOX            0x02
-UMASK_TO_R_B_HOM_MSGQ_OCCUPANCY_ALL             0x03
-
 EVENT_B2S_DRS_BYPASS                            0x53 SBOX
 UMASK_B2S_DRS_BYPASS                            0x00
 
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index 4d89d1594..bd780acf6 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -71,10 +71,10 @@ UMASK_DTLB_LOAD_MISSES_PDP_MISS        0x40
 UMASK_DTLB_LOAD_MISSES_LARGE_WALK_COMPLETED  0x80
 
 EVENT_MEMORY_DISAMBIGURATION      0x09   PMC
-UMASK_MEMORY_DISAMBIGURATION_RESET         0x01 
-UMASK_MEMORY_DISAMBIGURATION_SUCCESS       0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCHDOG      0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCH_CYCLES  0x01 
+UMASK_MEMORY_DISAMBIGURATION_RESET         0x01
+UMASK_MEMORY_DISAMBIGURATION_SUCCESS       0x02
+UMASK_MEMORY_DISAMBIGURATION_WATCHDOG      0x04
+UMASK_MEMORY_DISAMBIGURATION_WATCH_CYCLES  0x08
 
 EVENT_MEM_INST_RETIRED           0x0B  PMC
 UMASK_MEM_INST_RETIRED_LOADS     0x01
@@ -85,15 +85,15 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
-UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1 0x01 
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1 0x01
+UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
-UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_CACHE_LOCAL_HOME_HIT   0x08 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                   0x10 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                    0x20 
+UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02
+UMASK_MEM_UNCORE_RETIRED_REMOTE_CACHE_LOCAL_HOME_HIT   0x08
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                   0x10
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                    0x20
 
 EVENT_FP_COMP_OPS_EXE            0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87        0x01
@@ -199,12 +199,12 @@ EVENT_L3_LAT_CACHE               0x2E   PMC
 UMASK_L3_LAT_CACHE_REFERENCE     0x4F
 UMASK_L3_LAT_CACHE_MISS          0x41
 
-EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
-UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
-UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+EVENT_CPU_CLOCK_UNHALTED          0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P 0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P    0x01
 
-EVENT_UOPS_DECODED                 0x3D
-UMASK_UOPS_DECODED_DEC0              0x01
+EVENT_UOPS_DECODED                0x3D PMC
+UMASK_UOPS_DECODED_DEC0           0x01
 
 EVENT_L1D_CACHE_LD               0x40   PMC0|PMC1
 UMASK_L1D_CACHE_LD_I_STATE       0x01
@@ -320,10 +320,10 @@ UMASK_BR_INST_EXEC_INDIRECT_NON_CALL     0x04
 UMASK_BR_INST_EXEC_NON_CALLS             0x07
 UMASK_BR_INST_EXEC_RETURN_NEAR           0x08
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL      0x10
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20 
-UMASK_BR_INST_EXEC_NEAR_CALLS            0x30 
-UMASK_BR_INST_EXEC_TAKEN                 0x40 
-UMASK_BR_INST_EXEC_ANY                   0x7F 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20
+UMASK_BR_INST_EXEC_NEAR_CALLS            0x30
+UMASK_BR_INST_EXEC_TAKEN                 0x40
+UMASK_BR_INST_EXEC_ANY                   0x7F
 
 EVENT_BR_MISP_EXEC                    0x89   PMC
 UMASK_BR_MISP_EXEC_COND               0x01
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 74b5fe1be..d8e4b966f 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -250,63 +250,61 @@ UMASK_BTB_MISSES            0x00
 EVENT_BR_BOGUS             0xE4 PMC
 UMASK_BR_BOGUS            0x00
 
-EVENT_BACLEARS             0xE6 PMC
+EVENT_BACLEARS            0xE6 PMC
 UMASK_BACLEARS            0x00
 
-EVENT_RESOURCE_STALLS             0xA2 PMC
+EVENT_RESOURCE_STALLS            0xA2 PMC
 UMASK_RESOURCE_STALLS            0x00
 
-EVENT_PARTIAL_RAT_STALL             0xD2 PMC
+EVENT_PARTIAL_RAT_STALL            0xD2 PMC
 UMASK_PARTIAL_RAT_STALL            0x00
 
-EVENT_SEGMENT_REG_LOADS             0x06 PMC
+EVENT_SEGMENT_REG_LOADS            0x06 PMC
 UMASK_SEGMENT_REG_LOADS            0x00
 
-EVENT_CPU_CLK_UNHALTED             0x79 PMC
+EVENT_CPU_CLK_UNHALTED            0x79 PMC
 UMASK_CPU_CLK_UNHALTED            0x00
 
-EVENT_CPU_CLK_UNHALTED             0x79 PMC
-UMASK_CPU_CLK_UNHALTED            0x00
-
-EVENT_MMX_INSTR_EXEC             0xB0 PMC
+EVENT_MMX_INSTR_EXEC            0xB0 PMC
 UMASK_MMX_INSTR_EXEC            0x00
 
-EVENT_MMX_SAT_INSTR_EXEC             0xB1 PMC
+EVENT_MMX_SAT_INSTR_EXEC            0xB1 PMC
 UMASK_MMX_SAT_INSTR_EXEC            0x00
 
 EVENT_MMX_UOPS_EXEC             0xB2 PMC
-UMASK_MMX_UOPS_EXEC            0x0F
+UMASK_MMX_UOPS_EXEC             0x0F
 
-EVENT_MMX_INSTR_TYPE_EXEC             0xB3 PMC
-UMASK_MMX_INSTR_TYPE_EXEC_PACKED_MUL      0x01
-UMASK_MMX_INSTR_TYPE_EXEC_PACKED_SHIFT    0x02
-UMASK_MMX_INSTR_TYPE_EXEC_PACK            0x04
-UMASK_MMX_INSTR_TYPE_EXEC_UNPACK          0x08
-UMASK_MMX_INSTR_TYPE_EXEC_PACKED_LOGICAL          0x10
-UMASK_MMX_INSTR_TYPE_EXEC_PACKED_ARITHMETIC          0x20
+EVENT_MMX_INSTR_TYPE_EXEC                       0xB3 PMC
+UMASK_MMX_INSTR_TYPE_EXEC_PACKED_MUL            0x01
+UMASK_MMX_INSTR_TYPE_EXEC_PACKED_SHIFT          0x02
+UMASK_MMX_INSTR_TYPE_EXEC_PACK                  0x04
+UMASK_MMX_INSTR_TYPE_EXEC_UNPACK                0x08
+UMASK_MMX_INSTR_TYPE_EXEC_PACKED_LOGICAL        0x10
+UMASK_MMX_INSTR_TYPE_EXEC_PACKED_ARITHMETIC     0x20
 
 EVENT_FP_MMX_TRANS             0xCC PMC
-UMASK_FP_MMX_TRANS_MMX_FP            0x00
-UMASK_FP_MMX_TRANS_FP_MMX            0x01
+UMASK_FP_MMX_TRANS_MMX_FP      0x00
+UMASK_FP_MMX_TRANS_FP_MMX      0x01
 
 EVENT_MMX_ASSIST             0xCD PMC
-UMASK_MMX_ASSIST            0x00
+UMASK_MMX_ASSIST             0x00
 
-EVENT_MMX_INSTR_RET             0xCE PMC
-UMASK_MMX_INSTR_RET            0x00
+EVENT_MMX_INSTR_RET          0xCE PMC
+UMASK_MMX_INSTR_RET          0x00
 
-EVENT_SEG_RENAME_STALLS             0xD4 PMC
-UMASK_SEG_RENAME_STALLS_ES            0x02
-UMASK_SEG_RENAME_STALLS_DS            0x04
-UMASK_SEG_RENAME_STALLS_FS            0x08
-UMASK_SEG_RENAME_STALLS_ALL            0x0F
+EVENT_SEG_RENAME_STALLS               0xD4 PMC
+UMASK_SEG_RENAME_STALLS_ES            0x01
+UMASK_SEG_RENAME_STALLS_DS            0x02
+UMASK_SEG_RENAME_STALLS_FS            0x04
+UMASK_SEG_RENAME_STALLS_GS            0x08
+UMASK_SEG_RENAME_STALLS_ALL           0x0F
 
 EVENT_SEG_REG_RENAMES             0xD5 PMC
-UMASK_SEG_REG_RENAMES            0x02
-UMASK_SEG_REG_RENAMES            0x04
-UMASK_SEG_REG_RENAMES            0x08
-UMASK_SEG_REG_RENAMES            0x0F
+UMASK_SEG_REG_RENAMES_ES          0x01
+UMASK_SEG_REG_RENAMES_DS          0x02
+UMASK_SEG_REG_RENAMES_FS          0x04
+UMASK_SEG_REG_RENAMES_GS          0x08
+UMASK_SEG_REG_RENAMES_ALL         0x0F
 
 EVENT_RET_SEG_RENAMES             0xD6 PMC
-UMASK_RET_SEG_RENAMES            0x00
-
+UMASK_RET_SEG_RENAMES             0x00
diff --git a/src/includes/perfmon_perfevent.h b/src/includes/perfmon_perfevent.h
index f065b98b4..0fe5fc713 100644
--- a/src/includes/perfmon_perfevent.h
+++ b/src/includes/perfmon_perfevent.h
@@ -58,29 +58,34 @@ perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
     return ret;
 }
 
+/* Return the level stored in /proc/sys/kernel/perf_event_paranoid, or 3 if the file cannot be opened or read. */
+int perfevent_paranoid_value()
+{
+    int paranoid = 3;
+    char buff[100] = {0}; /* zero-filled so the text handed to atoi() is always NUL-terminated */
+    FILE* fd = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
+    if (fd == NULL)
+    {
+        fprintf(stderr, "ERROR: Linux kernel has no perf_event support\n");
+        fprintf(stderr, "ERROR: Cannot open file /proc/sys/kernel/perf_event_paranoid\n");
+        return paranoid;
+    }
+    size_t read = fread(buff, sizeof(char), sizeof(buff) - 1, fd); /* keep the last byte as terminator */
+    if (read > 0)
+    {
+        paranoid = atoi(buff);
+    }
+    fclose(fd);
+    return paranoid;
+}
+
 int perfmon_init_perfevent(int cpu_id)
 {
-    size_t read;
     int paranoid = -1;
-    char buff[100];
-    FILE* fd;
     if (!informed_paranoid)
     {
-        fd = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
-        if (fd == NULL)
-        {
-            fprintf(stderr, "ERROR: Linux kernel has no perf_event support\n");
-            fprintf(stderr, "ERROR: Cannot open file /proc/sys/kernel/perf_event_paranoid\n");
-            fclose(fd);
-            exit(EXIT_FAILURE);
-        }
-        read = fread(buff, sizeof(char), 100, fd);
-        if (read > 0)
-        {
-            paranoid_level = atoi(buff);
-        }
-        fclose(fd);
-#if defined(__x86_64__) || defined(__i386__)
+        paranoid_level = perfevent_paranoid_value();
+#if defined(__x86_64__) || defined(__i386__) || defined(_ARCH_PPC)
         if (paranoid_level > 0 && getuid() != 0)
         {
             fprintf(stderr, "WARN: Linux kernel configured with paranoid level %d\n", paranoid_level);
@@ -97,6 +102,8 @@ int perfmon_init_perfevent(int cpu_id)
     }
     lock_acquire((int*) &tile_lock[affinity_thread2core_lookup[cpu_id]], cpu_id);
     lock_acquire((int*) &socket_lock[affinity_thread2socket_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &numa_lock[affinity_thread2numa_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]], cpu_id);
     if (cpu_event_fds == NULL)
     {
         cpu_event_fds = malloc(cpuid_topology.numHWThreads * sizeof(int*));
@@ -116,7 +123,7 @@ int perfmon_init_perfevent(int cpu_id)
     return 0;
 }
 
-int perf_fixed_setup(struct perf_event_attr *attr, PerfmonEvent *event)
+int perf_fixed_setup(struct perf_event_attr *attr, RegisterIndex index, PerfmonEvent *event)
 {
     int ret = -1;
     attr->type = PERF_TYPE_HARDWARE;
@@ -170,6 +177,10 @@ static char* perfEventOptionNames[] = {
     [EVENT_OPTION_OCCUPANCY_FILTER] = "occ_band0",
     [EVENT_OPTION_OCCUPANCY_EDGE] = "occ_edge",
     [EVENT_OPTION_OCCUPANCY_INVERT] = "occ_inv",
+#ifdef _ARCH_PPC
+    [EVENT_OPTION_PMC] = "pmc",
+    [EVENT_OPTION_PMCXSEL] = "pmcxsel",
+#endif
 };
 
 int getEventOptionConfig(char* base, EventOptionType type, PERF_EVENT_PMC_OPT_REGS *reg, int* start, int* end)
@@ -252,7 +263,7 @@ uint64_t create_mask(uint32_t value, int start, int end)
     return 0x0ULL;
 }
 
-int perf_pmc_setup(struct perf_event_attr *attr, PerfmonEvent *event)
+int perf_pmc_setup(struct perf_event_attr *attr, RegisterIndex index, PerfmonEvent *event)
 {
     uint64_t offcore_flags = 0x0ULL;
     PERF_EVENT_PMC_OPT_REGS reg = PERF_EVENT_INVAL_REG;
@@ -331,7 +342,23 @@ int perf_pmc_setup(struct perf_event_attr *attr, PerfmonEvent *event)
                 break;
         }
     }
-
+#ifdef _ARCH_PPC
+    getEventOptionConfig("/sys/devices/cpu", EVENT_OPTION_PMC, &reg, &start, &end);
+    switch(reg)
+    {
+        case PERF_EVENT_CONFIG_REG:
+            attr->config |= create_mask(getCounterTypeOffset(index)+1,start, end);
+            break;
+        case PERF_EVENT_CONFIG1_REG:
+            attr->config1 |= create_mask(getCounterTypeOffset(index)+1,start, end);
+            break;
+        case PERF_EVENT_CONFIG2_REG:
+            attr->config2 |= create_mask(getCounterTypeOffset(index)+1,start, end);
+            break;
+        default:
+            break;
+    }
+#endif
     return 0;
 }
 
@@ -452,6 +479,7 @@ int perfmon_setupCountersThread_perfevent(
     }
     for (int i=0;i < eventSet->numberOfEvents;i++)
     {
+        int has_lock = 0;
         is_uncore = 0;
         RegisterIndex index = eventSet->events[i].index;
         if (cpu_event_fds[cpu_id][index] != -1)
@@ -469,7 +497,7 @@ int perfmon_setupCountersThread_perfevent(
         switch (type)
         {
             case FIXED:
-                ret = perf_fixed_setup(&attr, event);
+                ret = perf_fixed_setup(&attr, index, event);
                 if (ret < 0)
                 {
                     continue;
@@ -477,7 +505,7 @@ int perfmon_setupCountersThread_perfevent(
                 VERBOSEPRINTREG(cpu_id, index, attr.config, SETUP_FIXED);
                 break;
             case PMC:
-                ret = perf_pmc_setup(&attr, event);
+                ret = perf_pmc_setup(&attr, index, event);
                 VERBOSEPRINTREG(cpu_id, index, attr.config, SETUP_PMC);
                 break;
             case POWER:
@@ -551,9 +579,34 @@ int perfmon_setupCountersThread_perfevent(
             case EUBOX5:
             case EUBOX6:
             case EUBOX7:
-                ret = perf_uncore_setup(&attr, type, event);
-                is_uncore = 1;
-                VERBOSEPRINTREG(cpu_id, index, attr.config, SETUP_UNCORE);
+
+                if (cpuid_info.family == ZEN_FAMILY && type == MBOX0)
+                {
+                    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+                    {
+                        has_lock = 1;
+                    }
+                }
+                else if (cpuid_info.family == ZEN_FAMILY && type == CBOX0)
+                {
+                    if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
+                    {
+                        has_lock = 1;
+                    }
+                }
+                else
+                {
+                    if (socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id)
+                    {
+                        has_lock = 1;
+                    }
+                }
+                if (has_lock)
+                {
+                    ret = perf_uncore_setup(&attr, type, event);
+                    is_uncore = 1;
+                    VERBOSEPRINTREG(cpu_id, index, attr.config, SETUP_UNCORE);
+                }
                 break;
 #endif
             default:
@@ -561,8 +614,7 @@ int perfmon_setupCountersThread_perfevent(
         }
         if (ret == 0)
         {
-
-            if (!is_uncore || socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id)
+            if (!is_uncore || has_lock)
             {
                 pid_t curpid = allpid;
                 if (is_uncore && curpid >= 0)
@@ -708,4 +760,3 @@ int perfmon_finalizeCountersThread_perfevent(int thread_id, PerfmonEventSet* eve
     }
     return 0;
 }
-
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 7c63f11d9..fce3a70db 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -416,17 +416,18 @@ EVENT_MMX_INSTR_RET             0xCE      PMC
 UMASK_MMX_INSTR_RET             0x00
 
 EVENT_SEG_RENAME_STALLS               0xD4      PMC
-UMASK_SEG_RENAME_STALLS_ES            0x02
-UMASK_SEG_RENAME_STALLS_DS            0x04
-UMASK_SEG_RENAME_STALLS_FS            0x08
+UMASK_SEG_RENAME_STALLS_ES            0x01
+UMASK_SEG_RENAME_STALLS_DS            0x02
+UMASK_SEG_RENAME_STALLS_FS            0x04
+UMASK_SEG_RENAME_STALLS_GS            0x08
 UMASK_SEG_RENAME_STALLS_ALL           0x0F
 
 EVENT_SEG_REG_RENAMES             0xD5      PMC
-UMASK_SEG_REG_RENAMES             0x02
-UMASK_SEG_REG_RENAMES             0x04
-UMASK_SEG_REG_RENAMES             0x08
-UMASK_SEG_REG_RENAMES             0x0F
+UMASK_SEG_REG_RENAMES_ES          0x01
+UMASK_SEG_REG_RENAMES_DS          0x02
+UMASK_SEG_REG_RENAMES_FS          0x04
+UMASK_SEG_REG_RENAMES_GS          0x08
+UMASK_SEG_REG_RENAMES_ALL         0x0F
 
 EVENT_RET_SEG_RENAMES             0xD6      PMC
 UMASK_RET_SEG_RENAMES             0x00
-
diff --git a/src/includes/perfmon_power8.h b/src/includes/perfmon_power8.h
new file mode 100644
index 000000000..10b8ca104
--- /dev/null
+++ b/src/includes/perfmon_power8.h
@@ -0,0 +1,12 @@
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+#include <perfmon_power8_counters.h>
+#include <perfmon_power8_events.h>
+
+static int perfmon_numCountersPower8 = NUM_COUNTERS_POWER8;
+static int perfmon_numCoreCountersPower8 = NUM_COUNTERS_POWER8;
+static int perfmon_numArchEventsPower8 = NUM_ARCH_EVENTS_POWER8;
+
diff --git a/src/includes/perfmon_power8_counters.h b/src/includes/perfmon_power8_counters.h
new file mode 100644
index 000000000..8e39f1f39
--- /dev/null
+++ b/src/includes/perfmon_power8_counters.h
@@ -0,0 +1,21 @@
+
+
+#define NUM_COUNTERS_POWER8 7
+
+static RegisterMap power8_counter_map[NUM_COUNTERS_POWER8] = {
+    {"PMC0", PMC0, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC1", PMC1, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC2", PMC2, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC3", PMC3, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC4", PMC4, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC5", PMC5, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PURR", PMC6, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+static BoxMap power8_box_map[NUM_UNITS] = {
+    [PMC] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+};
+
+static char* power8_translate_types[NUM_UNITS] = {
+    [PMC] = "/sys/bus/event_source/devices/cpu",
+};
diff --git a/src/includes/perfmon_power8_events.txt b/src/includes/perfmon_power8_events.txt
new file mode 100644
index 000000000..a15b1a52d
--- /dev/null
+++ b/src/includes/perfmon_power8_events.txt
@@ -0,0 +1,2987 @@
+EVENT_PM_1LPAR_CYC 0x5E PMC0
+UMASK_PM_1LPAR_CYC 0x0F
+
+EVENT_PM_1PLUS_PPC_CMPL 0xF2 PMC0
+UMASK_PM_1PLUS_PPC_CMPL 0x00
+
+EVENT_PM_1PLUS_PPC_DISP 0xF2 PMC3
+UMASK_PM_1PLUS_PPC_DISP 0x00
+
+EVENT_PM_2LPAR_CYC 0x6E PMC1
+UMASK_PM_2LPAR_CYC 0x00
+
+EVENT_PM_4LPAR_CYC 0x5E PMC3
+UMASK_PM_4LPAR_CYC 0x0E
+
+EVENT_PM_ALL_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_ALL_CHIP_PUMP_CPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_ALL_GRP_PUMP_CPRED 0x00 0x05 0x00
+
+EVENT_PM_ALL_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_ALL_GRP_PUMP_MPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_ALL_GRP_PUMP_MPRED_RTY 0x00 0x06 0x00
+
+EVENT_PM_ALL_PUMP_CPRED 0x54 PMC0
+UMASK_PM_ALL_PUMP_CPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_PUMP_MPRED 0x52 PMC3
+UMASK_PM_ALL_PUMP_MPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_ALL_SYS_PUMP_CPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_ALL_SYS_PUMP_MPRED 0x00 0x06 0x00
+
+EVENT_PM_ALL_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_ALL_SYS_PUMP_MPRED_RTY 0x00 0x06 0x00
+
+EVENT_PM_ANY_THRD_RUN_CYC 0xFA PMC0
+UMASK_PM_ANY_THRD_RUN_CYC 0x00
+
+EVENT_PM_BACK_BR_CMPL 0x5E PMC1
+UMASK_PM_BACK_BR_CMPL 0x05
+
+EVENT_PM_BANK_CONFLICT 0x82 PMC
+UMASK_PM_BANK_CONFLICT 0x04
+
+EVENT_PM_BRU_FIN 0x68 PMC0
+UMASK_PM_BRU_FIN 0x00
+
+EVENT_PM_BR_2PATH 0x36 PMC1|PMC3
+UMASK_PM_BR_2PATH 0x00
+
+EVENT_PM_BR_BC_8 0x86 PMC
+UMASK_PM_BR_BC_8 0x05
+
+EVENT_PM_BR_BC_8_CONV 0x84 PMC
+UMASK_PM_BR_BC_8_CONV 0x05
+
+EVENT_PM_BR_CMPL 0x60 PMC3
+UMASK_PM_BR_CMPL 0x00
+
+EVENT_PM_BR_MPRED_CCACHE 0xAC PMC
+UMASK_PM_BR_MPRED_CCACHE 0x04
+
+EVENT_PM_BR_MPRED_CMPL 0xF6 PMC3
+UMASK_PM_BR_MPRED_CMPL 0x00
+
+EVENT_PM_BR_MPRED_CR 0xB8 PMC
+UMASK_PM_BR_MPRED_CR 0x04
+
+EVENT_PM_BR_MPRED_LSTACK 0xAE PMC
+UMASK_PM_BR_MPRED_LSTACK 0x04
+
+EVENT_PM_BR_MPRED_TA 0xBA PMC
+UMASK_PM_BR_MPRED_TA 0x04
+
+EVENT_PM_BR_MRK_2PATH 0x38 PMC0
+UMASK_PM_BR_MRK_2PATH 0x00 0x20 0x00
+
+EVENT_PM_BR_PRED_BR0 0x9C PMC
+UMASK_PM_BR_PRED_BR0 0x04
+
+EVENT_PM_BR_PRED_BR1 0x9E PMC
+UMASK_PM_BR_PRED_BR1 0x04
+
+EVENT_PM_BR_PRED_BR_CMPL 0x9C PMC
+UMASK_PM_BR_PRED_BR_CMPL 0x04 0x10 0x00
+
+EVENT_PM_BR_PRED_CCACHE_BR0 0xA4 PMC
+UMASK_PM_BR_PRED_CCACHE_BR0 0x04
+
+EVENT_PM_BR_PRED_CCACHE_BR1 0xA6 PMC
+UMASK_PM_BR_PRED_CCACHE_BR1 0x04
+
+EVENT_PM_BR_PRED_CCACHE_CMPL 0xA4 PMC
+UMASK_PM_BR_PRED_CCACHE_CMPL 0x04 0x10 0x00
+
+EVENT_PM_BR_PRED_CR_BR0 0xB0 PMC
+UMASK_PM_BR_PRED_CR_BR0 0x04
+
+EVENT_PM_BR_PRED_CR_BR1 0xB2 PMC
+UMASK_PM_BR_PRED_CR_BR1 0x04
+
+EVENT_PM_BR_PRED_CR_CMPL 0xB0 PMC
+UMASK_PM_BR_PRED_CR_CMPL 0x04 0x10 0x00
+
+EVENT_PM_BR_PRED_LSTACK_BR0 0xA8 PMC
+UMASK_PM_BR_PRED_LSTACK_BR0 0x04
+
+EVENT_PM_BR_PRED_LSTACK_BR1 0xAA PMC
+UMASK_PM_BR_PRED_LSTACK_BR1 0x04
+
+EVENT_PM_BR_PRED_LSTACK_CMPL 0xA8 PMC
+UMASK_PM_BR_PRED_LSTACK_CMPL 0x04 0x10 0x00
+
+EVENT_PM_BR_PRED_TA_BR0 0xB4 PMC
+UMASK_PM_BR_PRED_TA_BR0 0x04
+
+EVENT_PM_BR_PRED_TA_BR1 0xB6 PMC
+UMASK_PM_BR_PRED_TA_BR1 0x04
+
+EVENT_PM_BR_PRED_TA_CMPL 0xB4 PMC
+UMASK_PM_BR_PRED_TA_CMPL 0x04 0x10 0x00
+
+EVENT_PM_BR_TAKEN_CMPL 0xFA PMC1
+UMASK_PM_BR_TAKEN_CMPL 0x00
+
+EVENT_PM_BR_UNCOND_BR0 0xA0 PMC
+UMASK_PM_BR_UNCOND_BR0 0x04
+
+EVENT_PM_BR_UNCOND_BR1 0xA2 PMC
+UMASK_PM_BR_UNCOND_BR1 0x04
+
+EVENT_PM_BR_UNCOND_CMPL 0xA0 PMC
+UMASK_PM_BR_UNCOND_CMPL 0x04 0x10 0x00
+
+EVENT_PM_CASTOUT_ISSUED 0x94 PMC
+UMASK_PM_CASTOUT_ISSUED 0x03
+
+EVENT_PM_CASTOUT_ISSUED_GPR 0x96 PMC
+UMASK_PM_CASTOUT_ISSUED_GPR 0x03
+
+EVENT_PM_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_CHIP_PUMP_CPRED 0x00
+
+EVENT_PM_CLB_HELD 0x90 PMC
+UMASK_PM_CLB_HELD 0x02
+
+EVENT_PM_CMPLU_STALL 0x0A PMC3
+UMASK_PM_CMPLU_STALL 0x00
+
+EVENT_PM_CMPLU_STALL_ALT 0x54 PMC0
+UMASK_PM_CMPLU_STALL_ALT 0x0E
+
+EVENT_PM_CMPLU_STALL_BRU 0x18 PMC3
+UMASK_PM_CMPLU_STALL_BRU 0x0D
+
+EVENT_PM_CMPLU_STALL_BRU_CRU 0x18 PMC1
+UMASK_PM_CMPLU_STALL_BRU_CRU 0x0D
+
+EVENT_PM_CMPLU_STALL_COQ_FULL 0x26 PMC2
+UMASK_PM_CMPLU_STALL_COQ_FULL 0x00
+
+EVENT_PM_CMPLU_STALL_DCACHE_MISS 0x12 PMC1
+UMASK_PM_CMPLU_STALL_DCACHE_MISS 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_L21_L31 0x18 PMC1
+UMASK_PM_CMPLU_STALL_DMISS_L21_L31 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_L2L3 0x16 PMC1
+UMASK_PM_CMPLU_STALL_DMISS_L2L3 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_L2L3_CONFLICT 0x16 PMC3
+UMASK_PM_CMPLU_STALL_DMISS_L2L3_CONFLICT 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_L3MISS 0x1A PMC3
+UMASK_PM_CMPLU_STALL_DMISS_L3MISS 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_LMEM 0x18 PMC3
+UMASK_PM_CMPLU_STALL_DMISS_LMEM 0x0C
+
+EVENT_PM_CMPLU_STALL_DMISS_REMOTE 0x1C PMC1
+UMASK_PM_CMPLU_STALL_DMISS_REMOTE 0x0C
+
+EVENT_PM_CMPLU_STALL_ERAT_MISS 0x12 PMC3
+UMASK_PM_CMPLU_STALL_ERAT_MISS 0x0C
+
+EVENT_PM_CMPLU_STALL_FLUSH 0x38 PMC2
+UMASK_PM_CMPLU_STALL_FLUSH 0x00
+
+EVENT_PM_CMPLU_STALL_FXLONG 0x16 PMC3
+UMASK_PM_CMPLU_STALL_FXLONG 0x0D
+
+EVENT_PM_CMPLU_STALL_FXU 0x16 PMC1
+UMASK_PM_CMPLU_STALL_FXU 0x0D
+
+EVENT_PM_CMPLU_STALL_HWSYNC 0x36 PMC2
+UMASK_PM_CMPLU_STALL_HWSYNC 0x00
+
+EVENT_PM_CMPLU_STALL_LOAD_FINISH 0x14 PMC3
+UMASK_PM_CMPLU_STALL_LOAD_FINISH 0x0D
+
+EVENT_PM_CMPLU_STALL_LSU 0x10 PMC1
+UMASK_PM_CMPLU_STALL_LSU 0x0C
+
+EVENT_PM_CMPLU_STALL_LWSYNC 0x36 PMC0
+UMASK_PM_CMPLU_STALL_LWSYNC 0x00
+
+EVENT_PM_CMPLU_STALL_MEM_ECC_DELAY 0x28 PMC2
+UMASK_PM_CMPLU_STALL_MEM_ECC_DELAY 0x00
+
+EVENT_PM_CMPLU_STALL_NO_NTF 0x1C PMC1
+UMASK_PM_CMPLU_STALL_NO_NTF 0x0E
+
+EVENT_PM_CMPLU_STALL_NTCG_FLUSH 0x1E PMC1
+UMASK_PM_CMPLU_STALL_NTCG_FLUSH 0x0E
+
+EVENT_PM_CMPLU_STALL_OTHER_CMPL 0x06 PMC2
+UMASK_PM_CMPLU_STALL_OTHER_CMPL 0x00
+
+EVENT_PM_CMPLU_STALL_REJECT 0x10 PMC3
+UMASK_PM_CMPLU_STALL_REJECT 0x0C
+
+EVENT_PM_CMPLU_STALL_REJECT_LHS 0x1A PMC1
+UMASK_PM_CMPLU_STALL_REJECT_LHS 0x0C
+
+EVENT_PM_CMPLU_STALL_REJ_LMQ_FULL 0x14 PMC3
+UMASK_PM_CMPLU_STALL_REJ_LMQ_FULL 0x0C
+
+EVENT_PM_CMPLU_STALL_SCALAR 0x10 PMC3
+UMASK_PM_CMPLU_STALL_SCALAR 0x0D
+
+EVENT_PM_CMPLU_STALL_SCALAR_LONG 0x10 PMC1
+UMASK_PM_CMPLU_STALL_SCALAR_LONG 0x0D
+
+EVENT_PM_CMPLU_STALL_STORE 0x14 PMC1
+UMASK_PM_CMPLU_STALL_STORE 0x0C
+
+EVENT_PM_CMPLU_STALL_ST_FWD 0x1C PMC3
+UMASK_PM_CMPLU_STALL_ST_FWD 0x0C
+
+EVENT_PM_CMPLU_STALL_THRD 0x1C PMC0
+UMASK_PM_CMPLU_STALL_THRD 0x00
+
+EVENT_PM_CMPLU_STALL_VECTOR 0x14 PMC1
+UMASK_PM_CMPLU_STALL_VECTOR 0x0D
+
+EVENT_PM_CMPLU_STALL_VECTOR_LONG 0x12 PMC3
+UMASK_PM_CMPLU_STALL_VECTOR_LONG 0x0D
+
+EVENT_PM_CMPLU_STALL_VSU 0x12 PMC1
+UMASK_PM_CMPLU_STALL_VSU 0x0D
+
+EVENT_PM_CO0_ALLOC 0x83 PMC0
+UMASK_PM_CO0_ALLOC 0x06
+
+EVENT_PM_CO0_BUSY 0x82 PMC0
+UMASK_PM_CO0_BUSY 0x06
+
+EVENT_PM_CO0_BUSY_ALT 0x82 PMC0
+UMASK_PM_CO0_BUSY_ALT 0x06 0x07 0x00
+
+EVENT_PM_CO_USAGE 0x8A PMC2
+UMASK_PM_CO_USAGE 0x06
+
+EVENT_PM_CO_USAGE_ALT 0x8A PMC2
+UMASK_PM_CO_USAGE_ALT 0x06 0x07 0x00
+
+EVENT_PM_CRU_FIN 0x66 PMC3
+UMASK_PM_CRU_FIN 0x00
+
+EVENT_PM_CYC 0x1E PMC
+UMASK_PM_CYC 0x00
+
+EVENT_PM_CYC_ALT 0xF0 PMC0
+UMASK_PM_CYC_ALT 0x00
+
+EVENT_PM_DATA_ALL_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_DATA_ALL_CHIP_PUMP_CPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_DATA_ALL_FROM_DL2L3_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_DATA_ALL_FROM_DL2L3_SHR 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_DL4 0x4C PMC2
+UMASK_PM_DATA_ALL_FROM_DL4 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_DMEM 0x4C PMC3
+UMASK_PM_DATA_ALL_FROM_DMEM 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2 0x42 PMC0
+UMASK_PM_DATA_ALL_FROM_L2 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_DATA_ALL_FROM_L21_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_DATA_ALL_FROM_L21_SHR 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2MISS_MOD 0x4E PMC0
+UMASK_PM_DATA_ALL_FROM_L2MISS_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_DATA_ALL_FROM_L2_DISP_CONFLICT_LDHITST 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_DATA_ALL_FROM_L2_DISP_CONFLICT_OTHER 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_DATA_ALL_FROM_L2_MEPF 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_DATA_ALL_FROM_L2_NO_CONFLICT 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L3 0x42 PMC3
+UMASK_PM_DATA_ALL_FROM_L3 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_DATA_ALL_FROM_L31_ECO_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_DATA_ALL_FROM_L31_ECO_SHR 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_DATA_ALL_FROM_L31_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_DATA_ALL_FROM_L31_SHR 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_DATA_ALL_FROM_L3MISS_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_DATA_ALL_FROM_L3_DISP_CONFLICT 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_DATA_ALL_FROM_L3_MEPF 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_DATA_ALL_FROM_L3_NO_CONFLICT 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_LL4 0x4C PMC0
+UMASK_PM_DATA_ALL_FROM_LL4 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_LMEM 0x48 PMC1
+UMASK_PM_DATA_ALL_FROM_LMEM 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_MEMORY 0x4C PMC1
+UMASK_PM_DATA_ALL_FROM_MEMORY 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_DATA_ALL_FROM_OFF_CHIP_CACHE 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_DATA_ALL_FROM_ON_CHIP_CACHE 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_DATA_ALL_FROM_RL2L3_MOD 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_DATA_ALL_FROM_RL2L3_SHR 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_RL4 0x4A PMC1
+UMASK_PM_DATA_ALL_FROM_RL4 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_FROM_RMEM 0x4A PMC2
+UMASK_PM_DATA_ALL_FROM_RMEM 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_DATA_ALL_GRP_PUMP_CPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_DATA_ALL_GRP_PUMP_MPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_DATA_ALL_GRP_PUMP_MPRED_RTY 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_PUMP_CPRED 0x54 PMC0
+UMASK_PM_DATA_ALL_PUMP_CPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_PUMP_MPRED 0x52 PMC3
+UMASK_PM_DATA_ALL_PUMP_MPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_DATA_ALL_SYS_PUMP_CPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_DATA_ALL_SYS_PUMP_MPRED 0x0C 0x06 0x00
+
+EVENT_PM_DATA_ALL_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_DATA_ALL_SYS_PUMP_MPRED_RTY 0x0C 0x06 0x00
+
+EVENT_PM_DATA_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_DATA_CHIP_PUMP_CPRED 0x0C
+
+EVENT_PM_DATA_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_DATA_FROM_DL2L3_MOD 0x0C
+
+EVENT_PM_DATA_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_DATA_FROM_DL2L3_SHR 0x0C
+
+EVENT_PM_DATA_FROM_DL4 0x4C PMC2
+UMASK_PM_DATA_FROM_DL4 0x0C
+
+EVENT_PM_DATA_FROM_DMEM 0x4C PMC3
+UMASK_PM_DATA_FROM_DMEM 0x0C
+
+EVENT_PM_DATA_FROM_L2 0x42 PMC0
+UMASK_PM_DATA_FROM_L2 0x0C
+
+EVENT_PM_DATA_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_DATA_FROM_L21_MOD 0x0C
+
+EVENT_PM_DATA_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_DATA_FROM_L21_SHR 0x0C
+
+EVENT_PM_DATA_FROM_L2MISS 0xFE PMC1
+UMASK_PM_DATA_FROM_L2MISS 0x00
+
+EVENT_PM_DATA_FROM_L2MISS_MOD 0x4E PMC0
+UMASK_PM_DATA_FROM_L2MISS_MOD 0x0C
+
+EVENT_PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0x0C
+
+EVENT_PM_DATA_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_DATA_FROM_L2_DISP_CONFLICT_OTHER 0x0C
+
+EVENT_PM_DATA_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_DATA_FROM_L2_MEPF 0x0C
+
+EVENT_PM_DATA_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_DATA_FROM_L2_NO_CONFLICT 0x0C
+
+EVENT_PM_DATA_FROM_L3 0x42 PMC3
+UMASK_PM_DATA_FROM_L3 0x0C
+
+EVENT_PM_DATA_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_DATA_FROM_L31_ECO_MOD 0x0C
+
+EVENT_PM_DATA_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_DATA_FROM_L31_ECO_SHR 0x0C
+
+EVENT_PM_DATA_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_DATA_FROM_L31_MOD 0x0C
+
+EVENT_PM_DATA_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_DATA_FROM_L31_SHR 0x0C
+
+EVENT_PM_DATA_FROM_L3MISS 0xFE PMC2
+UMASK_PM_DATA_FROM_L3MISS 0x00
+
+EVENT_PM_DATA_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_DATA_FROM_L3MISS_MOD 0x0C
+
+EVENT_PM_DATA_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_DATA_FROM_L3_DISP_CONFLICT 0x0C
+
+EVENT_PM_DATA_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_DATA_FROM_L3_MEPF 0x0C
+
+EVENT_PM_DATA_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_DATA_FROM_L3_NO_CONFLICT 0x0C
+
+EVENT_PM_DATA_FROM_LL4 0x4C PMC0
+UMASK_PM_DATA_FROM_LL4 0x0C
+
+EVENT_PM_DATA_FROM_LMEM 0x48 PMC1
+UMASK_PM_DATA_FROM_LMEM 0x0C
+
+EVENT_PM_DATA_FROM_MEM 0xFE PMC3
+UMASK_PM_DATA_FROM_MEM 0x00
+
+EVENT_PM_DATA_FROM_MEMORY 0x4C PMC1
+UMASK_PM_DATA_FROM_MEMORY 0x0C
+
+EVENT_PM_DATA_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_DATA_FROM_OFF_CHIP_CACHE 0x0C
+
+EVENT_PM_DATA_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_DATA_FROM_ON_CHIP_CACHE 0x0C
+
+EVENT_PM_DATA_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_DATA_FROM_RL2L3_MOD 0x0C
+
+EVENT_PM_DATA_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_DATA_FROM_RL2L3_SHR 0x0C
+
+EVENT_PM_DATA_FROM_RL4 0x4A PMC1
+UMASK_PM_DATA_FROM_RL4 0x0C
+
+EVENT_PM_DATA_FROM_RMEM 0x4A PMC2
+UMASK_PM_DATA_FROM_RMEM 0x0C
+
+EVENT_PM_DATA_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_DATA_GRP_PUMP_CPRED 0x0C
+
+EVENT_PM_DATA_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_DATA_GRP_PUMP_MPRED 0x0C
+
+EVENT_PM_DATA_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_DATA_GRP_PUMP_MPRED_RTY 0x0C
+
+EVENT_PM_DATA_PUMP_CPRED 0x54 PMC0
+UMASK_PM_DATA_PUMP_CPRED 0x0C
+
+EVENT_PM_DATA_PUMP_MPRED 0x52 PMC3
+UMASK_PM_DATA_PUMP_MPRED 0x0C
+
+EVENT_PM_DATA_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_DATA_SYS_PUMP_CPRED 0x0C
+
+EVENT_PM_DATA_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_DATA_SYS_PUMP_MPRED 0x0C
+
+EVENT_PM_DATA_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_DATA_SYS_PUMP_MPRED_RTY 0x0C
+
+EVENT_PM_DATA_TABLEWALK_CYC 0x1A PMC2
+UMASK_PM_DATA_TABLEWALK_CYC 0x00
+
+EVENT_PM_DC_COLLISIONS 0xBC PMC
+UMASK_PM_DC_COLLISIONS 0x0E
+
+EVENT_PM_DC_PREF_STREAM_ALLOC 0x50 PMC0
+UMASK_PM_DC_PREF_STREAM_ALLOC 0x0E
+
+EVENT_PM_DC_PREF_STREAM_CONF 0x50 PMC1
+UMASK_PM_DC_PREF_STREAM_CONF 0x0E
+
+EVENT_PM_DC_PREF_STREAM_FUZZY_CONF 0x50 PMC3
+UMASK_PM_DC_PREF_STREAM_FUZZY_CONF 0x0E
+
+EVENT_PM_DC_PREF_STREAM_STRIDED_CONF 0x50 PMC2
+UMASK_PM_DC_PREF_STREAM_STRIDED_CONF 0x0E
+
+EVENT_PM_DERAT_MISS_16G 0x54 PMC3
+UMASK_PM_DERAT_MISS_16G 0x0C
+
+EVENT_PM_DERAT_MISS_16M 0x54 PMC2
+UMASK_PM_DERAT_MISS_16M 0x0C
+
+EVENT_PM_DERAT_MISS_4K 0x56 PMC0
+UMASK_PM_DERAT_MISS_4K 0x0C
+
+EVENT_PM_DERAT_MISS_64K 0x54 PMC1
+UMASK_PM_DERAT_MISS_64K 0x0C
+
+EVENT_PM_DFU 0xBA PMC
+UMASK_PM_DFU 0x0B
+
+EVENT_PM_DFU_DCFFIX 0xBE PMC
+UMASK_PM_DFU_DCFFIX 0x0B
+
+EVENT_PM_DFU_DENBCD 0xBC PMC
+UMASK_PM_DFU_DENBCD 0x0B
+
+EVENT_PM_DFU_MC 0xB8 PMC
+UMASK_PM_DFU_MC 0x0B
+
+EVENT_PM_DISP_CLB_HELD_BAL 0x92 PMC
+UMASK_PM_DISP_CLB_HELD_BAL 0x02
+
+EVENT_PM_DISP_CLB_HELD_RES 0x94 PMC
+UMASK_PM_DISP_CLB_HELD_RES 0x02
+
+EVENT_PM_DISP_CLB_HELD_SB 0xA8 PMC
+UMASK_PM_DISP_CLB_HELD_SB 0x02
+
+EVENT_PM_DISP_CLB_HELD_SYNC 0x98 PMC
+UMASK_PM_DISP_CLB_HELD_SYNC 0x02
+
+EVENT_PM_DISP_CLB_HELD_TLBIE 0x96 PMC
+UMASK_PM_DISP_CLB_HELD_TLBIE 0x02
+
+EVENT_PM_DISP_HELD 0x06 PMC0
+UMASK_PM_DISP_HELD 0x00
+
+EVENT_PM_DISP_HELD_IQ_FULL 0x06 PMC1
+UMASK_PM_DISP_HELD_IQ_FULL 0x00
+
+EVENT_PM_DISP_HELD_MAP_FULL 0x2A PMC0
+UMASK_PM_DISP_HELD_MAP_FULL 0x00
+
+EVENT_PM_DISP_HELD_SRQ_FULL 0x18 PMC2
+UMASK_PM_DISP_HELD_SRQ_FULL 0x00
+
+EVENT_PM_DISP_HELD_SYNC_HOLD 0x3C PMC3
+UMASK_PM_DISP_HELD_SYNC_HOLD 0x00
+
+EVENT_PM_DISP_HOLD_GCT_FULL 0xA6 PMC
+UMASK_PM_DISP_HOLD_GCT_FULL 0x03
+
+EVENT_PM_DISP_WT 0x08 PMC2
+UMASK_PM_DISP_WT 0x00
+
+EVENT_PM_DPTEG_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_DPTEG_FROM_DL2L3_MOD 0x0E
+
+EVENT_PM_DPTEG_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_DPTEG_FROM_DL2L3_SHR 0x0E
+
+EVENT_PM_DPTEG_FROM_DL4 0x4C PMC2
+UMASK_PM_DPTEG_FROM_DL4 0x0E
+
+EVENT_PM_DPTEG_FROM_DMEM 0x4C PMC3
+UMASK_PM_DPTEG_FROM_DMEM 0x0E
+
+EVENT_PM_DPTEG_FROM_L2 0x42 PMC0
+UMASK_PM_DPTEG_FROM_L2 0x0E
+
+EVENT_PM_DPTEG_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_DPTEG_FROM_L21_MOD 0x0E
+
+EVENT_PM_DPTEG_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_DPTEG_FROM_L21_SHR 0x0E
+
+EVENT_PM_DPTEG_FROM_L2MISS 0x4E PMC0
+UMASK_PM_DPTEG_FROM_L2MISS 0x0E
+
+EVENT_PM_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x0E
+
+EVENT_PM_DPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_DPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x0E
+
+EVENT_PM_DPTEG_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_DPTEG_FROM_L2_MEPF 0x0E
+
+EVENT_PM_DPTEG_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_DPTEG_FROM_L2_NO_CONFLICT 0x0E
+
+EVENT_PM_DPTEG_FROM_L3 0x42 PMC3
+UMASK_PM_DPTEG_FROM_L3 0x0E
+
+EVENT_PM_DPTEG_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_DPTEG_FROM_L31_ECO_MOD 0x0E
+
+EVENT_PM_DPTEG_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_DPTEG_FROM_L31_ECO_SHR 0x0E
+
+EVENT_PM_DPTEG_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_DPTEG_FROM_L31_MOD 0x0E
+
+EVENT_PM_DPTEG_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_DPTEG_FROM_L31_SHR 0x0E
+
+EVENT_PM_DPTEG_FROM_L3MISS 0x4E PMC3
+UMASK_PM_DPTEG_FROM_L3MISS 0x0E
+
+EVENT_PM_DPTEG_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_DPTEG_FROM_L3_DISP_CONFLICT 0x0E
+
+EVENT_PM_DPTEG_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_DPTEG_FROM_L3_MEPF 0x0E
+
+EVENT_PM_DPTEG_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_DPTEG_FROM_L3_NO_CONFLICT 0x0E
+
+EVENT_PM_DPTEG_FROM_LL4 0x4C PMC0
+UMASK_PM_DPTEG_FROM_LL4 0x0E
+
+EVENT_PM_DPTEG_FROM_LMEM 0x48 PMC1
+UMASK_PM_DPTEG_FROM_LMEM 0x0E
+
+EVENT_PM_DPTEG_FROM_MEMORY 0x4C PMC1
+UMASK_PM_DPTEG_FROM_MEMORY 0x0E
+
+EVENT_PM_DPTEG_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_DPTEG_FROM_OFF_CHIP_CACHE 0x0E
+
+EVENT_PM_DPTEG_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_DPTEG_FROM_ON_CHIP_CACHE 0x0E
+
+EVENT_PM_DPTEG_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_DPTEG_FROM_RL2L3_MOD 0x0E
+
+EVENT_PM_DPTEG_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_DPTEG_FROM_RL2L3_SHR 0x0E
+
+EVENT_PM_DPTEG_FROM_RL4 0x4A PMC1
+UMASK_PM_DPTEG_FROM_RL4 0x0E
+
+EVENT_PM_DPTEG_FROM_RMEM 0x4A PMC2
+UMASK_PM_DPTEG_FROM_RMEM 0x0E
+
+EVENT_PM_DSLB_MISS 0x94 PMC
+UMASK_PM_DSLB_MISS 0x0D
+
+EVENT_PM_DSLB_MISS_ALT 0x16 PMC0
+UMASK_PM_DSLB_MISS_ALT 0x00
+
+EVENT_PM_DTLB_MISS 0xFC PMC2
+UMASK_PM_DTLB_MISS 0x00
+
+EVENT_PM_DTLB_MISS_16G 0x58 PMC0
+UMASK_PM_DTLB_MISS_16G 0x0C
+
+EVENT_PM_DTLB_MISS_16M 0x56 PMC3
+UMASK_PM_DTLB_MISS_16M 0x0C
+
+EVENT_PM_DTLB_MISS_4K 0x56 PMC1
+UMASK_PM_DTLB_MISS_4K 0x0C
+
+EVENT_PM_DTLB_MISS_64K 0x56 PMC2
+UMASK_PM_DTLB_MISS_64K 0x0C
+
+EVENT_PM_EAT_FORCE_MISPRED 0xA8 PMC
+UMASK_PM_EAT_FORCE_MISPRED 0x05
+
+EVENT_PM_EAT_FULL_CYC 0x84 PMC
+UMASK_PM_EAT_FULL_CYC 0x04
+
+EVENT_PM_EE_OFF_EXT_INT 0x80 PMC
+UMASK_PM_EE_OFF_EXT_INT 0x02
+
+EVENT_PM_EXT_INT 0xF8 PMC1
+UMASK_PM_EXT_INT 0x00
+
+EVENT_PM_FAV_TBEGIN 0xB4 PMC
+UMASK_PM_FAV_TBEGIN 0x02
+
+EVENT_PM_FLOP 0xF4 PMC0
+UMASK_PM_FLOP 0x00
+
+EVENT_PM_FLOP_SUM_SCALAR 0xAE PMC
+UMASK_PM_FLOP_SUM_SCALAR 0x0A
+
+EVENT_PM_FLOP_SUM_VEC 0xAC PMC
+UMASK_PM_FLOP_SUM_VEC 0x0A
+
+EVENT_PM_FLUSH 0xF8 PMC3
+UMASK_PM_FLUSH 0x00
+
+EVENT_PM_FLUSH_BR_MPRED 0x84 PMC
+UMASK_PM_FLUSH_BR_MPRED 0x02
+
+EVENT_PM_FLUSH_COMPLETION 0x12 PMC2
+UMASK_PM_FLUSH_COMPLETION 0x00
+
+EVENT_PM_FLUSH_DISP 0x82 PMC
+UMASK_PM_FLUSH_DISP 0x02
+
+EVENT_PM_FLUSH_DISP_SB 0x8C PMC
+UMASK_PM_FLUSH_DISP_SB 0x02
+
+EVENT_PM_FLUSH_DISP_SYNC 0x88 PMC
+UMASK_PM_FLUSH_DISP_SYNC 0x02
+
+EVENT_PM_FLUSH_DISP_TLBIE 0x8A PMC
+UMASK_PM_FLUSH_DISP_TLBIE 0x02
+
+EVENT_PM_FLUSH_LSU 0x8E PMC
+UMASK_PM_FLUSH_LSU 0x02
+
+EVENT_PM_FLUSH_PARTIAL 0x86 PMC
+UMASK_PM_FLUSH_PARTIAL 0x02
+
+EVENT_PM_FPU0_FCONV 0xB0 PMC
+UMASK_PM_FPU0_FCONV 0x0A
+
+EVENT_PM_FPU0_FEST 0xB8 PMC
+UMASK_PM_FPU0_FEST 0x0A
+
+EVENT_PM_FPU0_FRSP 0xB4 PMC
+UMASK_PM_FPU0_FRSP 0x0A
+
+EVENT_PM_FPU1_FCONV 0xB2 PMC
+UMASK_PM_FPU1_FCONV 0x0A
+
+EVENT_PM_FPU1_FEST 0xBA PMC
+UMASK_PM_FPU1_FEST 0x0A
+
+EVENT_PM_FPU1_FRSP 0xB6 PMC
+UMASK_PM_FPU1_FRSP 0x0A
+
+EVENT_PM_FREQ_DOWN 0x0C PMC2
+UMASK_PM_FREQ_DOWN 0x00
+
+EVENT_PM_FREQ_UP 0x0C PMC3
+UMASK_PM_FREQ_UP 0x00
+
+EVENT_PM_FUSION_TOC_GRP0_1 0xB0 PMC
+UMASK_PM_FUSION_TOC_GRP0_1 0x05
+
+EVENT_PM_FUSION_TOC_GRP0_2 0xAE PMC
+UMASK_PM_FUSION_TOC_GRP0_2 0x05
+
+EVENT_PM_FUSION_TOC_GRP0_3 0xAC PMC
+UMASK_PM_FUSION_TOC_GRP0_3 0x05
+
+EVENT_PM_FUSION_TOC_GRP1_1 0xB2 PMC
+UMASK_PM_FUSION_TOC_GRP1_1 0x05
+
+EVENT_PM_FUSION_VSX_GRP0_1 0xB8 PMC
+UMASK_PM_FUSION_VSX_GRP0_1 0x05
+
+EVENT_PM_FUSION_VSX_GRP0_2 0xB6 PMC
+UMASK_PM_FUSION_VSX_GRP0_2 0x05
+
+EVENT_PM_FUSION_VSX_GRP0_3 0xB4 PMC
+UMASK_PM_FUSION_VSX_GRP0_3 0x05
+
+EVENT_PM_FUSION_VSX_GRP1_1 0xBA PMC
+UMASK_PM_FUSION_VSX_GRP1_1 0x05
+
+EVENT_PM_FXU0_BUSY_FXU1_IDLE 0x0E PMC2
+UMASK_PM_FXU0_BUSY_FXU1_IDLE 0x00
+
+EVENT_PM_FXU0_FIN 0x04 PMC0
+UMASK_PM_FXU0_FIN 0x00
+
+EVENT_PM_FXU1_BUSY_FXU0_IDLE 0x0E PMC3
+UMASK_PM_FXU1_BUSY_FXU0_IDLE 0x00
+
+EVENT_PM_FXU1_FIN 0x04 PMC3
+UMASK_PM_FXU1_FIN 0x00
+
+EVENT_PM_FXU_BUSY 0x0E PMC1
+UMASK_PM_FXU_BUSY 0x00
+
+EVENT_PM_FXU_IDLE 0x0E PMC0
+UMASK_PM_FXU_IDLE 0x00
+
+EVENT_PM_GCT_EMPTY_CYC 0x08 PMC1
+UMASK_PM_GCT_EMPTY_CYC 0x00
+
+EVENT_PM_GCT_MERGE 0xA4 PMC
+UMASK_PM_GCT_MERGE 0x03
+
+EVENT_PM_GCT_NOSLOT_BR_MPRED 0x1E PMC3
+UMASK_PM_GCT_NOSLOT_BR_MPRED 0x0D
+
+EVENT_PM_GCT_NOSLOT_BR_MPRED_ICMISS 0x1A PMC3
+UMASK_PM_GCT_NOSLOT_BR_MPRED_ICMISS 0x0D
+
+EVENT_PM_GCT_NOSLOT_CYC 0xF8 PMC0
+UMASK_PM_GCT_NOSLOT_CYC 0x00
+
+EVENT_PM_GCT_NOSLOT_DISP_HELD_ISSQ 0x1E PMC1
+UMASK_PM_GCT_NOSLOT_DISP_HELD_ISSQ 0x0D
+
+EVENT_PM_GCT_NOSLOT_DISP_HELD_MAP 0x1C PMC3
+UMASK_PM_GCT_NOSLOT_DISP_HELD_MAP 0x0D
+
+EVENT_PM_GCT_NOSLOT_DISP_HELD_OTHER 0x10 PMC1
+UMASK_PM_GCT_NOSLOT_DISP_HELD_OTHER 0x0E
+
+EVENT_PM_GCT_NOSLOT_DISP_HELD_SRQ 0x1C PMC1
+UMASK_PM_GCT_NOSLOT_DISP_HELD_SRQ 0x0D
+
+EVENT_PM_GCT_NOSLOT_IC_L3MISS 0x10 PMC3
+UMASK_PM_GCT_NOSLOT_IC_L3MISS 0x0E
+
+EVENT_PM_GCT_NOSLOT_IC_MISS 0x1A PMC1
+UMASK_PM_GCT_NOSLOT_IC_MISS 0x0D
+
+EVENT_PM_GCT_UTIL_11_14_ENTRIES 0xA2 PMC
+UMASK_PM_GCT_UTIL_11_14_ENTRIES 0x02
+
+EVENT_PM_GCT_UTIL_15_17_ENTRIES 0xA4 PMC
+UMASK_PM_GCT_UTIL_15_17_ENTRIES 0x02
+
+EVENT_PM_GCT_UTIL_18_ENTRIES 0xA6 PMC
+UMASK_PM_GCT_UTIL_18_ENTRIES 0x02
+
+EVENT_PM_GCT_UTIL_1_2_ENTRIES 0x9C PMC
+UMASK_PM_GCT_UTIL_1_2_ENTRIES 0x02
+
+EVENT_PM_GCT_UTIL_3_6_ENTRIES 0x9E PMC
+UMASK_PM_GCT_UTIL_3_6_ENTRIES 0x02
+
+EVENT_PM_GCT_UTIL_7_10_ENTRIES 0xA0 PMC
+UMASK_PM_GCT_UTIL_7_10_ENTRIES 0x02
+
+EVENT_PM_GRP_BR_MPRED_NONSPEC 0x0A PMC0
+UMASK_PM_GRP_BR_MPRED_NONSPEC 0x00
+
+EVENT_PM_GRP_CMPL 0x04 PMC2
+UMASK_PM_GRP_CMPL 0x00
+
+EVENT_PM_GRP_DISP 0x0A PMC2
+UMASK_PM_GRP_DISP 0x00
+
+EVENT_PM_GRP_IC_MISS_NONSPEC 0x0C PMC0
+UMASK_PM_GRP_IC_MISS_NONSPEC 0x00
+
+EVENT_PM_GRP_MRK 0x30 PMC0
+UMASK_PM_GRP_MRK 0x00 0x20 0x00
+
+EVENT_PM_GRP_NON_FULL_GROUP 0x9C PMC
+UMASK_PM_GRP_NON_FULL_GROUP 0x05
+
+EVENT_PM_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_GRP_PUMP_CPRED 0x00
+
+EVENT_PM_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_GRP_PUMP_MPRED 0x00
+
+EVENT_PM_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_GRP_PUMP_MPRED_RTY 0x00
+
+EVENT_PM_GRP_TERM_2ND_BRANCH 0xA4 PMC
+UMASK_PM_GRP_TERM_2ND_BRANCH 0x05
+
+EVENT_PM_GRP_TERM_FPU_AFTER_BR 0xA6 PMC
+UMASK_PM_GRP_TERM_FPU_AFTER_BR 0x05
+
+EVENT_PM_GRP_TERM_NOINST 0x9E PMC
+UMASK_PM_GRP_TERM_NOINST 0x05
+
+EVENT_PM_GRP_TERM_OTHER 0xA0 PMC
+UMASK_PM_GRP_TERM_OTHER 0x05
+
+EVENT_PM_GRP_TERM_SLOT_LIMIT 0xA2 PMC
+UMASK_PM_GRP_TERM_SLOT_LIMIT 0x05
+
+EVENT_PM_HV_CYC 0x0A PMC1
+UMASK_PM_HV_CYC 0x00
+
+EVENT_PM_IBUF_FULL_CYC 0x86 PMC
+UMASK_PM_IBUF_FULL_CYC 0x04
+
+EVENT_PM_IC_DEMAND_CYC 0x18 PMC0
+UMASK_PM_IC_DEMAND_CYC 0x00
+
+EVENT_PM_IC_DEMAND_L2_BHT_REDIRECT 0x98 PMC
+UMASK_PM_IC_DEMAND_L2_BHT_REDIRECT 0x04
+
+EVENT_PM_IC_DEMAND_L2_BR_REDIRECT 0x9A PMC
+UMASK_PM_IC_DEMAND_L2_BR_REDIRECT 0x04
+
+EVENT_PM_IC_DEMAND_REQ 0x88 PMC
+UMASK_PM_IC_DEMAND_REQ 0x04
+
+EVENT_PM_IC_INVALIDATE 0x8A PMC
+UMASK_PM_IC_INVALIDATE 0x05
+
+EVENT_PM_IC_PREF_CANCEL_HIT 0x92 PMC
+UMASK_PM_IC_PREF_CANCEL_HIT 0x04
+
+EVENT_PM_IC_PREF_CANCEL_L2 0x94 PMC
+UMASK_PM_IC_PREF_CANCEL_L2 0x04
+
+EVENT_PM_IC_PREF_CANCEL_PAGE 0x90 PMC
+UMASK_PM_IC_PREF_CANCEL_PAGE 0x04
+
+EVENT_PM_IC_PREF_REQ 0x8A PMC
+UMASK_PM_IC_PREF_REQ 0x04
+
+EVENT_PM_IC_PREF_WRITE 0x8E PMC
+UMASK_PM_IC_PREF_WRITE 0x04
+
+EVENT_PM_IC_RELOAD_PRIVATE 0x96 PMC
+UMASK_PM_IC_RELOAD_PRIVATE 0x04
+
+EVENT_PM_IERAT_RELOAD 0xF6 PMC0
+UMASK_PM_IERAT_RELOAD 0x00
+
+EVENT_PM_IERAT_RELOAD_16M 0x6A PMC3
+UMASK_PM_IERAT_RELOAD_16M 0x00
+
+EVENT_PM_IERAT_RELOAD_4K 0x64 PMC1
+UMASK_PM_IERAT_RELOAD_4K 0x00
+
+EVENT_PM_IERAT_RELOAD_64K 0x6A PMC2
+UMASK_PM_IERAT_RELOAD_64K 0x00
+
+EVENT_PM_IFETCH_THROTTLE 0x5E PMC2
+UMASK_PM_IFETCH_THROTTLE 0x04
+
+EVENT_PM_IFU_L2_TOUCH 0x88 PMC
+UMASK_PM_IFU_L2_TOUCH 0x05
+
+EVENT_PM_INST_ALL_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_INST_ALL_CHIP_PUMP_CPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_INST_ALL_FROM_DL2L3_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_INST_ALL_FROM_DL2L3_SHR 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_DL4 0x4C PMC2
+UMASK_PM_INST_ALL_FROM_DL4 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_DMEM 0x4C PMC3
+UMASK_PM_INST_ALL_FROM_DMEM 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2 0x42 PMC0
+UMASK_PM_INST_ALL_FROM_L2 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_INST_ALL_FROM_L21_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_INST_ALL_FROM_L21_SHR 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2MISS 0x4E PMC0
+UMASK_PM_INST_ALL_FROM_L2MISS 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_INST_ALL_FROM_L2_DISP_CONFLICT_LDHITST 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_INST_ALL_FROM_L2_DISP_CONFLICT_OTHER 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_INST_ALL_FROM_L2_MEPF 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_INST_ALL_FROM_L2_NO_CONFLICT 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L3 0x42 PMC3
+UMASK_PM_INST_ALL_FROM_L3 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_INST_ALL_FROM_L31_ECO_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_INST_ALL_FROM_L31_ECO_SHR 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_INST_ALL_FROM_L31_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_INST_ALL_FROM_L31_SHR 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_INST_ALL_FROM_L3MISS_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_INST_ALL_FROM_L3_DISP_CONFLICT 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_INST_ALL_FROM_L3_MEPF 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_INST_ALL_FROM_L3_NO_CONFLICT 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_LL4 0x4C PMC0
+UMASK_PM_INST_ALL_FROM_LL4 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_LMEM 0x48 PMC1
+UMASK_PM_INST_ALL_FROM_LMEM 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_MEMORY 0x4C PMC1
+UMASK_PM_INST_ALL_FROM_MEMORY 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_INST_ALL_FROM_OFF_CHIP_CACHE 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_INST_ALL_FROM_ON_CHIP_CACHE 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_INST_ALL_FROM_RL2L3_MOD 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_INST_ALL_FROM_RL2L3_SHR 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_RL4 0x4A PMC1
+UMASK_PM_INST_ALL_FROM_RL4 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_FROM_RMEM 0x4A PMC2
+UMASK_PM_INST_ALL_FROM_RMEM 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_INST_ALL_GRP_PUMP_CPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_INST_ALL_GRP_PUMP_MPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_INST_ALL_GRP_PUMP_MPRED_RTY 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_PUMP_CPRED 0x54 PMC0
+UMASK_PM_INST_ALL_PUMP_CPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_PUMP_MPRED 0x52 PMC3
+UMASK_PM_INST_ALL_PUMP_MPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_INST_ALL_SYS_PUMP_CPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_INST_ALL_SYS_PUMP_MPRED 0x04 0x05 0x00
+
+EVENT_PM_INST_ALL_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_INST_ALL_SYS_PUMP_MPRED_RTY 0x04 0x05 0x00
+
+EVENT_PM_INST_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_INST_CHIP_PUMP_CPRED 0x04
+
+EVENT_PM_INST_CMPL 0x02 PMC
+UMASK_PM_INST_CMPL 0x00
+
+EVENT_PM_INST_DISP 0xF2 PMC1|PMC2
+UMASK_PM_INST_DISP 0x00
+
+EVENT_PM_INST_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_INST_FROM_DL2L3_MOD 0x04
+
+EVENT_PM_INST_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_INST_FROM_DL2L3_SHR 0x04
+
+EVENT_PM_INST_FROM_DL4 0x4C PMC2
+UMASK_PM_INST_FROM_DL4 0x04
+
+EVENT_PM_INST_FROM_DMEM 0x4C PMC3
+UMASK_PM_INST_FROM_DMEM 0x04
+
+EVENT_PM_INST_FROM_L1 0x80 PMC
+UMASK_PM_INST_FROM_L1 0x04
+
+EVENT_PM_INST_FROM_L2 0x42 PMC0
+UMASK_PM_INST_FROM_L2 0x04
+
+EVENT_PM_INST_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_INST_FROM_L21_MOD 0x04
+
+EVENT_PM_INST_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_INST_FROM_L21_SHR 0x04
+
+EVENT_PM_INST_FROM_L2MISS 0x4E PMC0
+UMASK_PM_INST_FROM_L2MISS 0x04
+
+EVENT_PM_INST_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_INST_FROM_L2_DISP_CONFLICT_LDHITST 0x04
+
+EVENT_PM_INST_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_INST_FROM_L2_DISP_CONFLICT_OTHER 0x04
+
+EVENT_PM_INST_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_INST_FROM_L2_MEPF 0x04
+
+EVENT_PM_INST_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_INST_FROM_L2_NO_CONFLICT 0x04
+
+EVENT_PM_INST_FROM_L3 0x42 PMC3
+UMASK_PM_INST_FROM_L3 0x04
+
+EVENT_PM_INST_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_INST_FROM_L31_ECO_MOD 0x04
+
+EVENT_PM_INST_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_INST_FROM_L31_ECO_SHR 0x04
+
+EVENT_PM_INST_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_INST_FROM_L31_MOD 0x04
+
+EVENT_PM_INST_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_INST_FROM_L31_SHR 0x04
+
+EVENT_PM_INST_FROM_L3MISS 0xFA PMC2
+UMASK_PM_INST_FROM_L3MISS 0x00
+
+EVENT_PM_INST_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_INST_FROM_L3MISS_MOD 0x04
+
+EVENT_PM_INST_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_INST_FROM_L3_DISP_CONFLICT 0x04
+
+EVENT_PM_INST_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_INST_FROM_L3_MEPF 0x04
+
+EVENT_PM_INST_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_INST_FROM_L3_NO_CONFLICT 0x04
+
+EVENT_PM_INST_FROM_LL4 0x4C PMC0
+UMASK_PM_INST_FROM_LL4 0x04
+
+EVENT_PM_INST_FROM_LMEM 0x48 PMC1
+UMASK_PM_INST_FROM_LMEM 0x04
+
+EVENT_PM_INST_FROM_MEMORY 0x4C PMC1
+UMASK_PM_INST_FROM_MEMORY 0x04
+
+EVENT_PM_INST_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_INST_FROM_OFF_CHIP_CACHE 0x04
+
+EVENT_PM_INST_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_INST_FROM_ON_CHIP_CACHE 0x04
+
+EVENT_PM_INST_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_INST_FROM_RL2L3_MOD 0x04
+
+EVENT_PM_INST_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_INST_FROM_RL2L3_SHR 0x04
+
+EVENT_PM_INST_FROM_RL4 0x4A PMC1
+UMASK_PM_INST_FROM_RL4 0x04
+
+EVENT_PM_INST_FROM_RMEM 0x4A PMC2
+UMASK_PM_INST_FROM_RMEM 0x04
+
+EVENT_PM_INST_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_INST_GRP_PUMP_CPRED 0x04
+
+EVENT_PM_INST_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_INST_GRP_PUMP_MPRED 0x04
+
+EVENT_PM_INST_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_INST_GRP_PUMP_MPRED_RTY 0x04
+
+EVENT_PM_INST_IMC_MATCH_CMPL 0x3A PMC0
+UMASK_PM_INST_IMC_MATCH_CMPL 0x00
+
+EVENT_PM_INST_IMC_MATCH_DISP 0x16 PMC2
+UMASK_PM_INST_IMC_MATCH_DISP 0x00
+
+EVENT_PM_INST_PUMP_CPRED 0x54 PMC0
+UMASK_PM_INST_PUMP_CPRED 0x04
+
+EVENT_PM_INST_PUMP_MPRED 0x52 PMC3
+UMASK_PM_INST_PUMP_MPRED 0x04
+
+EVENT_PM_INST_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_INST_SYS_PUMP_CPRED 0x04
+
+EVENT_PM_INST_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_INST_SYS_PUMP_MPRED 0x04
+
+EVENT_PM_INST_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_INST_SYS_PUMP_MPRED_RTY 0x04
+
+EVENT_PM_IOPS_CMPL 0x14 PMC0
+UMASK_PM_IOPS_CMPL 0x00
+
+EVENT_PM_IOPS_DISP 0x14 PMC2
+UMASK_PM_IOPS_DISP 0x00
+
+EVENT_PM_IPTEG_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_IPTEG_FROM_DL2L3_MOD 0x05
+
+EVENT_PM_IPTEG_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_IPTEG_FROM_DL2L3_SHR 0x05
+
+EVENT_PM_IPTEG_FROM_DL4 0x4C PMC2
+UMASK_PM_IPTEG_FROM_DL4 0x05
+
+EVENT_PM_IPTEG_FROM_DMEM 0x4C PMC3
+UMASK_PM_IPTEG_FROM_DMEM 0x05
+
+EVENT_PM_IPTEG_FROM_L2 0x42 PMC0
+UMASK_PM_IPTEG_FROM_L2 0x05
+
+EVENT_PM_IPTEG_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_IPTEG_FROM_L21_MOD 0x05
+
+EVENT_PM_IPTEG_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_IPTEG_FROM_L21_SHR 0x05
+
+EVENT_PM_IPTEG_FROM_L2MISS 0x4E PMC0
+UMASK_PM_IPTEG_FROM_L2MISS 0x05
+
+EVENT_PM_IPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_IPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x05
+
+EVENT_PM_IPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_IPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x05
+
+EVENT_PM_IPTEG_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_IPTEG_FROM_L2_MEPF 0x05
+
+EVENT_PM_IPTEG_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_IPTEG_FROM_L2_NO_CONFLICT 0x05
+
+EVENT_PM_IPTEG_FROM_L3 0x42 PMC3
+UMASK_PM_IPTEG_FROM_L3 0x05
+
+EVENT_PM_IPTEG_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_IPTEG_FROM_L31_ECO_MOD 0x05
+
+EVENT_PM_IPTEG_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_IPTEG_FROM_L31_ECO_SHR 0x05
+
+EVENT_PM_IPTEG_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_IPTEG_FROM_L31_MOD 0x05
+
+EVENT_PM_IPTEG_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_IPTEG_FROM_L31_SHR 0x05
+
+EVENT_PM_IPTEG_FROM_L3MISS 0x4E PMC3
+UMASK_PM_IPTEG_FROM_L3MISS 0x05
+
+EVENT_PM_IPTEG_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_IPTEG_FROM_L3_DISP_CONFLICT 0x05
+
+EVENT_PM_IPTEG_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_IPTEG_FROM_L3_MEPF 0x05
+
+EVENT_PM_IPTEG_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_IPTEG_FROM_L3_NO_CONFLICT 0x05
+
+EVENT_PM_IPTEG_FROM_LL4 0x4C PMC0
+UMASK_PM_IPTEG_FROM_LL4 0x05
+
+EVENT_PM_IPTEG_FROM_LMEM 0x48 PMC1
+UMASK_PM_IPTEG_FROM_LMEM 0x05
+
+EVENT_PM_IPTEG_FROM_MEMORY 0x4C PMC1
+UMASK_PM_IPTEG_FROM_MEMORY 0x05
+
+EVENT_PM_IPTEG_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_IPTEG_FROM_OFF_CHIP_CACHE 0x05
+
+EVENT_PM_IPTEG_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_IPTEG_FROM_ON_CHIP_CACHE 0x05
+
+EVENT_PM_IPTEG_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_IPTEG_FROM_RL2L3_MOD 0x05
+
+EVENT_PM_IPTEG_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_IPTEG_FROM_RL2L3_SHR 0x05
+
+EVENT_PM_IPTEG_FROM_RL4 0x4A PMC1
+UMASK_PM_IPTEG_FROM_RL4 0x05
+
+EVENT_PM_IPTEG_FROM_RMEM 0x4A PMC2
+UMASK_PM_IPTEG_FROM_RMEM 0x05
+
+EVENT_PM_ISIDE_L2MEMACC 0x8E PMC3
+UMASK_PM_ISIDE_L2MEMACC 0x06
+
+EVENT_PM_ISIDE_L2MEMACC_ALT 0x8E PMC3
+UMASK_PM_ISIDE_L2MEMACC_ALT 0x06 0x07 0x00
+
+EVENT_PM_ISLB_MISS 0x96 PMC
+UMASK_PM_ISLB_MISS 0x0D
+
+EVENT_PM_ISLB_MISS_ALT 0x06 PMC3
+UMASK_PM_ISLB_MISS_ALT 0x00
+
+EVENT_PM_ISU_REF_FX0 0xAC PMC
+UMASK_PM_ISU_REF_FX0 0x03
+
+EVENT_PM_ISU_REF_FX1 0xAE PMC
+UMASK_PM_ISU_REF_FX1 0x03
+
+EVENT_PM_ISU_REF_FXU 0xAC PMC
+UMASK_PM_ISU_REF_FXU 0x03 0x10 0x00
+
+EVENT_PM_ISU_REF_LS0 0xB0 PMC
+UMASK_PM_ISU_REF_LS0 0x03
+
+EVENT_PM_ISU_REF_LS1 0xB2 PMC
+UMASK_PM_ISU_REF_LS1 0x03
+
+EVENT_PM_ISU_REF_LS2 0xB4 PMC
+UMASK_PM_ISU_REF_LS2 0x03
+
+EVENT_PM_ISU_REF_LS3 0xB6 PMC
+UMASK_PM_ISU_REF_LS3 0x03
+
+EVENT_PM_ISU_REJECTS_ALL 0x9C PMC
+UMASK_PM_ISU_REJECTS_ALL 0x03
+
+EVENT_PM_ISU_REJECT_RES_NA 0xA2 PMC
+UMASK_PM_ISU_REJECT_RES_NA 0x03
+
+EVENT_PM_ISU_REJECT_SAR_BYPASS 0x9E PMC
+UMASK_PM_ISU_REJECT_SAR_BYPASS 0x03
+
+EVENT_PM_ISU_REJECT_SRC_NA 0xA0 PMC
+UMASK_PM_ISU_REJECT_SRC_NA 0x03
+
+EVENT_PM_ISU_REJ_VS0 0xA8 PMC
+UMASK_PM_ISU_REJ_VS0 0x03
+
+EVENT_PM_ISU_REJ_VS1 0xAA PMC
+UMASK_PM_ISU_REJ_VS1 0x03
+
+EVENT_PM_ISU_REJ_VSU 0xA8 PMC
+UMASK_PM_ISU_REJ_VSU 0x03 0x10 0x00
+
+EVENT_PM_ISYNC 0xB8 PMC
+UMASK_PM_ISYNC 0x03
+
+EVENT_PM_ITLB_MISS 0xFC PMC3
+UMASK_PM_ITLB_MISS 0x00
+
+EVENT_PM_L1MISS_LAT_EXC_1024 0xEA PMC2
+UMASK_PM_L1MISS_LAT_EXC_1024 0x00 0x60 0x01
+
+EVENT_PM_L1MISS_LAT_EXC_2048 0xEC PMC3
+UMASK_PM_L1MISS_LAT_EXC_2048 0x00 0x60 0x01
+
+EVENT_PM_L1MISS_LAT_EXC_256 0xE8 PMC0
+UMASK_PM_L1MISS_LAT_EXC_256 0x00 0x60 0x01
+
+EVENT_PM_L1MISS_LAT_EXC_32 0xE6 PMC1
+UMASK_PM_L1MISS_LAT_EXC_32 0x00 0x60 0x01
+
+EVENT_PM_L1PF_L2MEMACC 0x86 PMC1
+UMASK_PM_L1PF_L2MEMACC 0x06
+
+EVENT_PM_L1PF_L2MEMACC_ALT 0x86 PMC1
+UMASK_PM_L1PF_L2MEMACC_ALT 0x06 0x07 0x00
+
+EVENT_PM_L1_DCACHE_RELOADED_ALL 0x2C PMC0
+UMASK_PM_L1_DCACHE_RELOADED_ALL 0x00
+
+EVENT_PM_L1_DCACHE_RELOAD_VALID 0xF6 PMC2
+UMASK_PM_L1_DCACHE_RELOAD_VALID 0x00
+
+EVENT_PM_L1_DEMAND_WRITE 0x8C PMC
+UMASK_PM_L1_DEMAND_WRITE 0x04
+
+EVENT_PM_L1_ICACHE_MISS 0xFD PMC1
+UMASK_PM_L1_ICACHE_MISS 0x00
+
+EVENT_PM_L1_ICACHE_RELOADED_ALL 0x12 PMC3
+UMASK_PM_L1_ICACHE_RELOADED_ALL 0x00
+
+EVENT_PM_L1_ICACHE_RELOADED_PREF 0x68 PMC2
+UMASK_PM_L1_ICACHE_RELOADED_PREF 0x00
+
+EVENT_PM_L2_CHIP_PUMP 0x84 PMC1
+UMASK_PM_L2_CHIP_PUMP 0x07
+
+EVENT_PM_L2_CHIP_PUMP_ALT 0x8C PMC3
+UMASK_PM_L2_CHIP_PUMP_ALT 0x06 0x06 0x00
+
+EVENT_PM_L2_GROUP_PUMP 0x86 PMC1
+UMASK_PM_L2_GROUP_PUMP 0x07
+
+EVENT_PM_L2_GROUP_PUMP_ALT 0x8E PMC3
+UMASK_PM_L2_GROUP_PUMP_ALT 0x06 0x06 0x00
+
+EVENT_PM_L2_RTY_ST 0x8A PMC2
+UMASK_PM_L2_RTY_ST 0x07
+
+EVENT_PM_L2_RTY_ST_ALT 0x8C PMC3
+UMASK_PM_L2_RTY_ST_ALT 0x07
+
+EVENT_PM_L2_RTY_LD  0x8C PMC3
+UMASK_PM_L2_RTY_LD  0x07
+
+EVENT_PM_L2_RTY_LD_ALT  0x8A PMC2
+UMASK_PM_L2_RTY_LD_ALT  0x07 0x06 0x00
+
+EVENT_PM_L2_ST 0x80 PMC0
+UMASK_PM_L2_ST 0x07
+
+EVENT_PM_L2_ST_ALT 0x82 PMC0
+UMASK_PM_L2_ST_ALT 0x06 0x04 0x00
+
+EVENT_PM_L2_ST_DISP 0x8C PMC3
+UMASK_PM_L2_ST_DISP 0x07 0x04 0x00
+
+EVENT_PM_L2_ST_HIT 0x8E PMC3
+UMASK_PM_L2_ST_HIT 0x07 0x04 0x00
+
+EVENT_PM_L2_ST_MISS 0x82 PMC0
+UMASK_PM_L2_ST_MISS 0x07
+
+EVENT_PM_L2_ST_MISS_ALT 0x86 PMC1
+UMASK_PM_L2_ST_MISS_ALT 0x06 0x04 0x00
+
+EVENT_PM_L2_LD 0x80 PMC0
+UMASK_PM_L2_LD 0x06 0x04 0x00
+
+EVENT_PM_L2_LD_DISP 0x88 PMC2
+UMASK_PM_L2_LD_DISP 0x07 0x04 0x00
+
+EVENT_PM_L2_LD_HIT 0x8A PMC2
+UMASK_PM_L2_LD_HIT 0x07 0x04 0x00
+
+EVENT_PM_L2_LD_MISS 0x84 PMC1
+UMASK_PM_L2_LD_MISS 0x06 0x04 0x00
+
+EVENT_PM_L2_TM_REQ_ABORT 0x5E PMC0
+UMASK_PM_L2_TM_REQ_ABORT 0x0E
+
+EVENT_PM_L2_TM_ST_ABORT_SISTER 0x5C PMC2
+UMASK_PM_L2_TM_ST_ABORT_SISTER 0x0E
+
+EVENT_PM_L2_SYS_PUMP  0x88 PMC2
+UMASK_PM_L2_SYS_PUMP  0x07
+
+EVENT_PM_L2_SYS_PUMP_ALT  0x80 PMC2
+UMASK_PM_L2_SYS_PUMP_ALT  0x07 0x06 0x00
+
+EVENT_PM_L3_CI_USAGE 0x82 PMC0
+UMASK_PM_L3_CI_USAGE 0x09 0x08 0x00
+
+EVENT_PM_L3_CO0_ALLOC 0x8B PMC2
+UMASK_PM_L3_CO0_ALLOC 0x09 0x08 0x00
+
+EVENT_PM_L3_CO0_BUSY 0x8A PMC2
+UMASK_PM_L3_CO0_BUSY 0x09 0x08 0x00
+
+EVENT_PM_L3_CO_L31 0x86 PMC1
+UMASK_PM_L3_CO_L31 0x08
+
+EVENT_PM_L3_CO_MEM 0x84 PMC1
+UMASK_PM_L3_CO_MEM 0x08
+
+EVENT_PM_L3_CO_MEPF 0x82 PMC0
+UMASK_PM_L3_CO_MEPF 0x08
+
+EVENT_PM_L3_CO_MEPF_ALT 0x5E PMC2
+UMASK_PM_L3_CO_MEPF_ALT 0x0E
+
+EVENT_PM_L3_LD_PREF 0x52 PMC0
+UMASK_PM_L3_LD_PREF 0x0E
+
+EVENT_PM_L3_PF0_ALLOC 0x8D PMC3
+UMASK_PM_L3_PF0_ALLOC 0x09 0x08 0x00
+
+EVENT_PM_L3_PF0_BUSY 0x8C PMC3
+UMASK_PM_L3_PF0_BUSY 0x09 0x08 0x00
+
+EVENT_PM_L3_PF_MISS_L3 0x80 PMC0
+UMASK_PM_L3_PF_MISS_L3 0x08
+
+EVENT_PM_L3_PF_OFF_CHIP_CACHE 0x8A PMC2
+UMASK_PM_L3_PF_OFF_CHIP_CACHE 0x08
+
+EVENT_PM_L3_PF_OFF_CHIP_MEM 0x8E PMC3
+UMASK_PM_L3_PF_OFF_CHIP_MEM 0x08
+
+EVENT_PM_L3_PF_ON_CHIP_CACHE 0x88 PMC2
+UMASK_PM_L3_PF_ON_CHIP_CACHE 0x08
+
+EVENT_PM_L3_PF_ON_CHIP_MEM 0x8C PMC3
+UMASK_PM_L3_PF_ON_CHIP_MEM 0x08
+
+EVENT_PM_L3_PF_USAGE 0x84 PMC1
+UMASK_PM_L3_PF_USAGE 0x09 0x08 0x00
+
+EVENT_PM_L3_PREF_ALL 0x52 PMC3
+UMASK_PM_L3_PREF_ALL 0x0E
+
+EVENT_PM_L3_RD0_ALLOC 0x8F PMC3
+UMASK_PM_L3_RD0_ALLOC 0x09 0x08 0x00
+
+EVENT_PM_L3_RD0_BUSY 0x8E PMC3
+UMASK_PM_L3_RD0_BUSY 0x09 0x08 0x00
+
+EVENT_PM_L3_RD_USAGE 0x86 PMC1
+UMASK_PM_L3_RD_USAGE 0x09 0x08 0x00
+
+EVENT_PM_L3_SN0_ALLOC 0x89 PMC2
+UMASK_PM_L3_SN0_ALLOC 0x09 0x08 0x00
+
+EVENT_PM_L3_SN0_BUSY 0x88 PMC2
+UMASK_PM_L3_SN0_BUSY 0x09 0x08 0x00
+
+EVENT_PM_L3_SN_USAGE 0x80 PMC0
+UMASK_PM_L3_SN_USAGE 0x09 0x08 0x00
+
+EVENT_PM_L3_ST_PREF 0x52 PMC1
+UMASK_PM_L3_ST_PREF 0x0E
+
+EVENT_PM_L3_SW_PREF 0x52 PMC2
+UMASK_PM_L3_SW_PREF 0x0E
+
+EVENT_PM_L3_WI0_ALLOC 0x81 PMC0
+UMASK_PM_L3_WI0_ALLOC 0x08
+
+EVENT_PM_LARX_FIN 0x58 PMC2
+UMASK_PM_LARX_FIN 0x0C
+
+EVENT_PM_LD_CMPL 0x2E PMC0
+UMASK_PM_LD_CMPL 0x00
+
+EVENT_PM_LD_L3MISS_PEND_CYC 0x62 PMC0
+UMASK_PM_LD_L3MISS_PEND_CYC 0x00
+
+EVENT_PM_LD_MISS_L1 0x54 PMC2
+UMASK_PM_LD_MISS_L1 0x0E
+
+EVENT_PM_LD_MISS_L1_ALT 0xF0 PMC3
+UMASK_PM_LD_MISS_L1_ALT 0x00
+
+EVENT_PM_LD_REF_L1 0xEE PMC0
+UMASK_PM_LD_REF_L1 0x00
+
+EVENT_PM_LD_REF_L1_LSU0 0x80 PMC
+UMASK_PM_LD_REF_L1_LSU0 0x0C
+
+EVENT_PM_LD_REF_L1_LSU1 0x82 PMC
+UMASK_PM_LD_REF_L1_LSU1 0x0C
+
+EVENT_PM_LD_REF_L1_LSU2 0x94 PMC
+UMASK_PM_LD_REF_L1_LSU2 0x0C
+
+EVENT_PM_LD_REF_L1_LSU3 0x96 PMC
+UMASK_PM_LD_REF_L1_LSU3 0x0C
+
+EVENT_PM_LINK_STACK_INVALID_PTR 0x9A PMC
+UMASK_PM_LINK_STACK_INVALID_PTR 0x05
+
+EVENT_PM_LINK_STACK_WRONG_ADD_PRED 0x98 PMC
+UMASK_PM_LINK_STACK_WRONG_ADD_PRED 0x05
+
+EVENT_PM_LS0_ERAT_MISS_PREF 0x80 PMC
+UMASK_PM_LS0_ERAT_MISS_PREF 0x0E
+
+EVENT_PM_LS0_L1_PREF 0xB8 PMC
+UMASK_PM_LS0_L1_PREF 0x0D
+
+EVENT_PM_LS0_L1_SW_PREF 0x98 PMC
+UMASK_PM_LS0_L1_SW_PREF 0x0C
+
+EVENT_PM_LS1_ERAT_MISS_PREF 0x82 PMC
+UMASK_PM_LS1_ERAT_MISS_PREF 0x0E
+
+EVENT_PM_LS1_L1_PREF 0xBA PMC
+UMASK_PM_LS1_L1_PREF 0x0D
+
+EVENT_PM_LS1_L1_SW_PREF 0x9A PMC
+UMASK_PM_LS1_L1_SW_PREF 0x0C
+
+EVENT_PM_LSU0_FLUSH_LRQ 0xB0 PMC
+UMASK_PM_LSU0_FLUSH_LRQ 0x0C
+
+EVENT_PM_LSU0_FLUSH_SRQ 0xB8 PMC
+UMASK_PM_LSU0_FLUSH_SRQ 0x0C
+
+EVENT_PM_LSU0_FLUSH_ULD 0xA4 PMC
+UMASK_PM_LSU0_FLUSH_ULD 0x0C
+
+EVENT_PM_LSU0_FLUSH_UST 0xAC PMC
+UMASK_PM_LSU0_FLUSH_UST 0x0C
+
+EVENT_PM_LSU0_L1_CAM_CANCEL 0x88 PMC
+UMASK_PM_LSU0_L1_CAM_CANCEL 0x0F
+
+EVENT_PM_LSU0_LARX_FIN 0x56 PMC0
+UMASK_PM_LSU0_LARX_FIN 0x0E
+
+EVENT_PM_LSU0_LMQ_LHR_MERGE 0x8C PMC
+UMASK_PM_LSU0_LMQ_LHR_MERGE 0x0D
+
+EVENT_PM_LSU0_NCLD 0x8C PMC
+UMASK_PM_LSU0_NCLD 0x0C
+
+EVENT_PM_LSU0_PRIMARY_ERAT_HIT 0x90 PMC
+UMASK_PM_LSU0_PRIMARY_ERAT_HIT 0x0E
+
+EVENT_PM_LSU0_REJECT 0x5A PMC0
+UMASK_PM_LSU0_REJECT 0x0E
+
+EVENT_PM_LSU0_SRQ_STFWD 0x9C PMC
+UMASK_PM_LSU0_SRQ_STFWD 0x0C
+
+EVENT_PM_LSU0_STORE_REJECT 0x84 PMC
+UMASK_PM_LSU0_STORE_REJECT 0x0F
+
+EVENT_PM_LSU0_TMA_REQ_L2 0xA8 PMC
+UMASK_PM_LSU0_TMA_REQ_L2 0x0E
+
+EVENT_PM_LSU0_TM_L1_HIT 0x98 PMC
+UMASK_PM_LSU0_TM_L1_HIT 0x0E
+
+EVENT_PM_LSU0_TM_L1_MISS 0xA0 PMC
+UMASK_PM_LSU0_TM_L1_MISS 0x0E
+
+EVENT_PM_LSU1_FLUSH_LRQ 0xB2 PMC
+UMASK_PM_LSU1_FLUSH_LRQ 0x0C
+
+EVENT_PM_LSU1_FLUSH_SRQ 0xBA PMC
+UMASK_PM_LSU1_FLUSH_SRQ 0x0C
+
+EVENT_PM_LSU1_FLUSH_ULD 0xA6 PMC
+UMASK_PM_LSU1_FLUSH_ULD 0x0C
+
+EVENT_PM_LSU1_FLUSH_UST 0xAE PMC
+UMASK_PM_LSU1_FLUSH_UST 0x0C
+
+EVENT_PM_LSU1_L1_CAM_CANCEL 0x8A PMC
+UMASK_PM_LSU1_L1_CAM_CANCEL 0x0F
+
+EVENT_PM_LSU1_LARX_FIN 0x56 PMC1
+UMASK_PM_LSU1_LARX_FIN 0x0E
+
+EVENT_PM_LSU1_LMQ_LHR_MERGE 0x8E PMC
+UMASK_PM_LSU1_LMQ_LHR_MERGE 0x0D
+
+EVENT_PM_LSU1_NCLD 0x8E PMC
+UMASK_PM_LSU1_NCLD 0x0C
+
+EVENT_PM_LSU1_PRIMARY_ERAT_HIT 0x92 PMC
+UMASK_PM_LSU1_PRIMARY_ERAT_HIT 0x0E
+
+EVENT_PM_LSU1_REJECT 0x5A PMC1
+UMASK_PM_LSU1_REJECT 0x0E
+
+EVENT_PM_LSU1_SRQ_STFWD 0x9E PMC
+UMASK_PM_LSU1_SRQ_STFWD 0x0C
+
+EVENT_PM_LSU1_STORE_REJECT 0x86 PMC
+UMASK_PM_LSU1_STORE_REJECT 0x0F
+
+EVENT_PM_LSU1_TMA_REQ_L2 0xAA PMC
+UMASK_PM_LSU1_TMA_REQ_L2 0x0E
+
+EVENT_PM_LSU1_TM_L1_HIT 0x9A PMC
+UMASK_PM_LSU1_TM_L1_HIT 0x0E
+
+EVENT_PM_LSU1_TM_L1_MISS 0xA2 PMC
+UMASK_PM_LSU1_TM_L1_MISS 0x0E
+
+EVENT_PM_LSU2_FLUSH_LRQ 0xB4 PMC
+UMASK_PM_LSU2_FLUSH_LRQ 0x0C
+
+EVENT_PM_LSU2_FLUSH_SRQ 0xBC PMC
+UMASK_PM_LSU2_FLUSH_SRQ 0x0C
+
+EVENT_PM_LSU2_FLUSH_ULD 0xA8 PMC
+UMASK_PM_LSU2_FLUSH_ULD 0x0C
+
+EVENT_PM_LSU2_L1_CAM_CANCEL 0x8C PMC
+UMASK_PM_LSU2_L1_CAM_CANCEL 0x0F
+
+EVENT_PM_LSU2_LARX_FIN 0x56 PMC2
+UMASK_PM_LSU2_LARX_FIN 0x0E
+
+EVENT_PM_LSU2_LDF 0x84 PMC
+UMASK_PM_LSU2_LDF 0x0C
+
+EVENT_PM_LSU2_LDX 0x88 PMC
+UMASK_PM_LSU2_LDX 0x0C
+
+EVENT_PM_LSU2_LMQ_LHR_MERGE 0x90 PMC
+UMASK_PM_LSU2_LMQ_LHR_MERGE 0x0D
+
+EVENT_PM_LSU2_PRIMARY_ERAT_HIT 0x94 PMC
+UMASK_PM_LSU2_PRIMARY_ERAT_HIT 0x0E
+
+EVENT_PM_LSU2_REJECT 0x5A PMC2
+UMASK_PM_LSU2_REJECT 0x0E
+
+EVENT_PM_LSU2_SRQ_STFWD 0xA0 PMC
+UMASK_PM_LSU2_SRQ_STFWD 0x0C
+
+EVENT_PM_LSU2_TMA_REQ_L2 0xAC PMC
+UMASK_PM_LSU2_TMA_REQ_L2 0x0E
+
+EVENT_PM_LSU2_TM_L1_HIT 0x9C PMC
+UMASK_PM_LSU2_TM_L1_HIT 0x0E
+
+EVENT_PM_LSU2_TM_L1_MISS 0xA4 PMC
+UMASK_PM_LSU2_TM_L1_MISS 0x0E
+
+EVENT_PM_LSU3_FLUSH_LRQ 0xB6 PMC
+UMASK_PM_LSU3_FLUSH_LRQ 0x0C
+
+EVENT_PM_LSU3_FLUSH_SRQ 0xBE PMC
+UMASK_PM_LSU3_FLUSH_SRQ 0x0C
+
+EVENT_PM_LSU3_FLUSH_ULD 0xAA PMC
+UMASK_PM_LSU3_FLUSH_ULD 0x0C
+
+EVENT_PM_LSU3_L1_CAM_CANCEL 0x8E PMC
+UMASK_PM_LSU3_L1_CAM_CANCEL 0x0F
+
+EVENT_PM_LSU3_LARX_FIN 0x56 PMC3
+UMASK_PM_LSU3_LARX_FIN 0x0E
+
+EVENT_PM_LSU3_LDF 0x86 PMC
+UMASK_PM_LSU3_LDF 0x0C
+
+EVENT_PM_LSU3_LDX 0x8A PMC
+UMASK_PM_LSU3_LDX 0x0C
+
+EVENT_PM_LSU3_LMQ_LHR_MERGE 0x92 PMC
+UMASK_PM_LSU3_LMQ_LHR_MERGE 0x0D
+
+EVENT_PM_LSU3_PRIMARY_ERAT_HIT 0x96 PMC
+UMASK_PM_LSU3_PRIMARY_ERAT_HIT 0x0E
+
+EVENT_PM_LSU3_REJECT 0x5A PMC3
+UMASK_PM_LSU3_REJECT 0x0E
+
+EVENT_PM_LSU3_SRQ_STFWD 0xA2 PMC
+UMASK_PM_LSU3_SRQ_STFWD 0x0C
+
+EVENT_PM_LSU3_TMA_REQ_L2 0xAE PMC
+UMASK_PM_LSU3_TMA_REQ_L2 0x0E
+
+EVENT_PM_LSU3_TM_L1_HIT 0x9E PMC
+UMASK_PM_LSU3_TM_L1_HIT 0x0E
+
+EVENT_PM_LSU3_TM_L1_MISS 0xA6 PMC
+UMASK_PM_LSU3_TM_L1_MISS 0x0E
+
+EVENT_PM_LSU_DERAT_MISS 0xF6 PMC1
+UMASK_PM_LSU_DERAT_MISS 0x00
+
+EVENT_PM_LSU_ERAT_MISS_PREF 0x80 PMC
+UMASK_PM_LSU_ERAT_MISS_PREF 0x0E 0x10 0x00
+
+EVENT_PM_LSU_FIN 0x66 PMC2
+UMASK_PM_LSU_FIN 0x00
+
+EVENT_PM_LSU_FLUSH_UST 0xAC PMC
+UMASK_PM_LSU_FLUSH_UST 0x0C 0x10 0x00
+
+EVENT_PM_LSU_FOUR_TABLEWALK_CYC 0xA4 PMC
+UMASK_PM_LSU_FOUR_TABLEWALK_CYC 0x0D
+
+EVENT_PM_LSU_FX_FIN 0x66 PMC0
+UMASK_PM_LSU_FX_FIN 0x00
+
+EVENT_PM_LSU_L1_PREF 0xB8 PMC
+UMASK_PM_LSU_L1_PREF 0x0D 0x10 0x00
+
+EVENT_PM_LSU_L1_SW_PREF 0x98 PMC
+UMASK_PM_LSU_L1_SW_PREF 0x0C 0x10 0x00
+
+EVENT_PM_LSU_LDF 0x84 PMC
+UMASK_PM_LSU_LDF 0x0C 0x10 0x00
+
+EVENT_PM_LSU_LDX 0x88 PMC
+UMASK_PM_LSU_LDX 0x0C 0x10 0x00
+
+EVENT_PM_LSU_LMQ_FULL_CYC 0xA2 PMC
+UMASK_PM_LSU_LMQ_FULL_CYC 0x0D
+
+EVENT_PM_LSU_LMQ_S0_ALLOC 0xA1 PMC
+UMASK_PM_LSU_LMQ_S0_ALLOC 0x0D
+
+EVENT_PM_LSU_LMQ_S0_VALID 0xA0 PMC
+UMASK_PM_LSU_LMQ_S0_VALID 0x0D
+
+EVENT_PM_LSU_LMQ_SRQ_EMPTY_ALL_CYC 0x1C PMC2
+UMASK_PM_LSU_LMQ_SRQ_EMPTY_ALL_CYC 0x00
+
+EVENT_PM_LSU_LMQ_SRQ_EMPTY_CYC 0x3E PMC1
+UMASK_PM_LSU_LMQ_SRQ_EMPTY_CYC 0x00
+
+EVENT_PM_LSU_LRQ_S0_ALLOC 0x9F PMC
+UMASK_PM_LSU_LRQ_S0_ALLOC 0x0D
+
+EVENT_PM_LSU_LRQ_S0_VALID 0x9E PMC
+UMASK_PM_LSU_LRQ_S0_VALID 0x0D
+
+EVENT_PM_LSU_LRQ_S43_ALLOC 0x91 PMC
+UMASK_PM_LSU_LRQ_S43_ALLOC 0x0F
+
+EVENT_PM_LSU_LRQ_S43_VALID 0x90 PMC
+UMASK_PM_LSU_LRQ_S43_VALID 0x0F
+
+EVENT_PM_LSU_MRK_DERAT_MISS 0x62 PMC2
+UMASK_PM_LSU_MRK_DERAT_MISS 0x00 0x20 0x00
+
+EVENT_PM_LSU_NCLD 0x8C PMC
+UMASK_PM_LSU_NCLD 0x0C 0x10 0x00
+
+EVENT_PM_LSU_NCST 0x92 PMC
+UMASK_PM_LSU_NCST 0x0C
+
+EVENT_PM_LSU_REJECT 0x64 PMC0
+UMASK_PM_LSU_REJECT 0x00
+
+EVENT_PM_LSU_REJECT_ERAT_MISS 0x5C PMC1
+UMASK_PM_LSU_REJECT_ERAT_MISS 0x0E
+
+EVENT_PM_LSU_REJECT_LHS 0x5C PMC3
+UMASK_PM_LSU_REJECT_LHS 0x0E
+
+EVENT_PM_LSU_REJECT_LMQ_FULL 0x5C PMC0
+UMASK_PM_LSU_REJECT_LMQ_FULL 0x0E
+
+EVENT_PM_LSU_SET_MPRED 0x82 PMC
+UMASK_PM_LSU_SET_MPRED 0x0D
+
+EVENT_PM_LSU_SRQ_EMPTY_CYC 0x08 PMC3
+UMASK_PM_LSU_SRQ_EMPTY_CYC 0x00
+
+EVENT_PM_LSU_SRQ_FULL_CYC 0x1A PMC0
+UMASK_PM_LSU_SRQ_FULL_CYC 0x00
+
+EVENT_PM_LSU_SRQ_S0_ALLOC 0x9D PMC
+UMASK_PM_LSU_SRQ_S0_ALLOC 0x0D
+
+EVENT_PM_LSU_SRQ_S0_VALID 0x9C PMC
+UMASK_PM_LSU_SRQ_S0_VALID 0x0D
+
+EVENT_PM_LSU_SRQ_S39_ALLOC 0x93 PMC
+UMASK_PM_LSU_SRQ_S39_ALLOC 0x0F
+
+EVENT_PM_LSU_SRQ_S39_VALID 0x92 PMC
+UMASK_PM_LSU_SRQ_S39_VALID 0x0F
+
+EVENT_PM_LSU_SRQ_SYNC 0x9B PMC
+UMASK_PM_LSU_SRQ_SYNC 0x0D
+
+EVENT_PM_LSU_SRQ_SYNC_CYC 0x9A PMC
+UMASK_PM_LSU_SRQ_SYNC_CYC 0x0D
+
+EVENT_PM_LSU_STORE_REJECT 0x84 PMC
+UMASK_PM_LSU_STORE_REJECT 0x0F
+
+EVENT_PM_LSU_TWO_TABLEWALK_CYC 0xA6 PMC
+UMASK_PM_LSU_TWO_TABLEWALK_CYC 0x0D
+
+EVENT_PM_LWSYNC 0x94 PMC
+UMASK_PM_LWSYNC 0x05
+
+EVENT_PM_LWSYNC_ALT 0x98 PMC
+UMASK_PM_LWSYNC_ALT 0x0D
+
+EVENT_PM_LWSYNC_HELD 0x9A PMC
+UMASK_PM_LWSYNC_HELD 0x02
+
+EVENT_PM_MEM_CO 0x58 PMC3
+UMASK_PM_MEM_CO 0x0C
+
+EVENT_PM_MEM_LOC_THRESH_IFU 0x58 PMC0
+UMASK_PM_MEM_LOC_THRESH_IFU 0x00
+
+EVENT_PM_MEM_LOC_THRESH_LSU_HIGH 0x56 PMC3
+UMASK_PM_MEM_LOC_THRESH_LSU_HIGH 0x00
+
+EVENT_PM_MEM_LOC_THRESH_LSU_MED 0x5E PMC0
+UMASK_PM_MEM_LOC_THRESH_LSU_MED 0x0C
+
+EVENT_PM_MEM_PREF 0x58 PMC1
+UMASK_PM_MEM_PREF 0x0C
+
+EVENT_PM_MEM_READ 0x56 PMC0
+UMASK_PM_MEM_READ 0x00
+
+EVENT_PM_MEM_RWITM 0x5E PMC2
+UMASK_PM_MEM_RWITM 0x0C
+
+EVENT_PM_MRK_BACK_BR_CMPL 0x5E PMC2
+UMASK_PM_MRK_BACK_BR_CMPL 0x05 0x20 0x00
+
+EVENT_PM_MRK_BRU_FIN 0x3A PMC1
+UMASK_PM_MRK_BRU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_BR_CMPL 0x6E PMC0
+UMASK_PM_MRK_BR_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_BR_MPRED_CMPL 0xE4 PMC2
+UMASK_PM_MRK_BR_MPRED_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_BR_TAKEN_CMPL 0xE2 PMC0
+UMASK_PM_MRK_BR_TAKEN_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_CRU_FIN 0x3A PMC2
+UMASK_PM_MRK_CRU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_MRK_DATA_FROM_DL2L3_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL2L3_MOD_CYC 0x28 PMC1
+UMASK_PM_MRK_DATA_FROM_DL2L3_MOD_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_MRK_DATA_FROM_DL2L3_SHR 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL2L3_SHR_CYC 0x28 PMC1
+UMASK_PM_MRK_DATA_FROM_DL2L3_SHR_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL4 0x4C PMC2
+UMASK_PM_MRK_DATA_FROM_DL4 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DL4_CYC 0x2C PMC1
+UMASK_PM_MRK_DATA_FROM_DL4_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DMEM 0x4C PMC3
+UMASK_PM_MRK_DATA_FROM_DMEM 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_DMEM_CYC 0x2C PMC1
+UMASK_PM_MRK_DATA_FROM_DMEM_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2 0x42 PMC0
+UMASK_PM_MRK_DATA_FROM_L2 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_MRK_DATA_FROM_L21_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L21_MOD_CYC 0x26 PMC1
+UMASK_PM_MRK_DATA_FROM_L21_MOD_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_MRK_DATA_FROM_L21_SHR 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L21_SHR_CYC 0x26 PMC1
+UMASK_PM_MRK_DATA_FROM_L21_SHR_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2MISS 0x4E PMC0
+UMASK_PM_MRK_DATA_FROM_L2MISS 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2MISS_ALT 0xE8 PMC3
+UMASK_PM_MRK_DATA_FROM_L2MISS_ALT 0x00 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2MISS_CYC 0x2E PMC3
+UMASK_PM_MRK_DATA_FROM_L2MISS_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_CYC 0x22 PMC3
+UMASK_PM_MRK_DATA_FROM_L2_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC 0x20 PMC1
+UMASK_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC 0x20 PMC1
+UMASK_PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_MRK_DATA_FROM_L2_MEPF 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_MEPF_CYC 0x20 PMC3
+UMASK_PM_MRK_DATA_FROM_L2_MEPF_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_MRK_DATA_FROM_L2_NO_CONFLICT 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC 0x20 PMC3
+UMASK_PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3 0x42 PMC3
+UMASK_PM_MRK_DATA_FROM_L3 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_MRK_DATA_FROM_L31_ECO_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_ECO_MOD_CYC 0x24 PMC1
+UMASK_PM_MRK_DATA_FROM_L31_ECO_MOD_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_MRK_DATA_FROM_L31_ECO_SHR 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_ECO_SHR_CYC 0x24 PMC1
+UMASK_PM_MRK_DATA_FROM_L31_ECO_SHR_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_MRK_DATA_FROM_L31_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_MOD_CYC 0x24 PMC3
+UMASK_PM_MRK_DATA_FROM_L31_MOD_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_MRK_DATA_FROM_L31_SHR 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L31_SHR_CYC 0x26 PMC3
+UMASK_PM_MRK_DATA_FROM_L31_SHR_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3MISS 0xE4 PMC1
+UMASK_PM_MRK_DATA_FROM_L3MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3MISS_ALT 0x4E PMC3
+UMASK_PM_MRK_DATA_FROM_L3MISS_ALT 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3MISS_CYC 0x2E PMC1
+UMASK_PM_MRK_DATA_FROM_L3MISS_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_CYC 0x22 PMC1
+UMASK_PM_MRK_DATA_FROM_L3_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_MRK_DATA_FROM_L3_DISP_CONFLICT 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC 0x22 PMC1
+UMASK_PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_MRK_DATA_FROM_L3_MEPF 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_MEPF_CYC 0x22 PMC3
+UMASK_PM_MRK_DATA_FROM_L3_MEPF_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_MRK_DATA_FROM_L3_NO_CONFLICT 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC 0x24 PMC3
+UMASK_PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_LL4 0x4C PMC0
+UMASK_PM_MRK_DATA_FROM_LL4 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_LL4_CYC 0x2C PMC3
+UMASK_PM_MRK_DATA_FROM_LL4_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_LMEM 0x48 PMC1
+UMASK_PM_MRK_DATA_FROM_LMEM 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_LMEM_CYC 0x28 PMC3
+UMASK_PM_MRK_DATA_FROM_LMEM_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_MEM 0xE0 PMC1
+UMASK_PM_MRK_DATA_FROM_MEM 0x00 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_MEMORY 0x4C PMC1
+UMASK_PM_MRK_DATA_FROM_MEMORY 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_MEMORY_CYC 0x2C PMC3
+UMASK_PM_MRK_DATA_FROM_MEMORY_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_MRK_DATA_FROM_OFF_CHIP_CACHE 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC 0x2A PMC1
+UMASK_PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_MRK_DATA_FROM_ON_CHIP_CACHE 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC 0x28 PMC3
+UMASK_PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_MRK_DATA_FROM_RL2L3_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL2L3_MOD_CYC 0x26 PMC3
+UMASK_PM_MRK_DATA_FROM_RL2L3_MOD_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_MRK_DATA_FROM_RL2L3_SHR 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL2L3_SHR_CYC 0x2A PMC3
+UMASK_PM_MRK_DATA_FROM_RL2L3_SHR_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL4 0x4A PMC1
+UMASK_PM_MRK_DATA_FROM_RL4 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RL4_CYC 0x2A PMC3
+UMASK_PM_MRK_DATA_FROM_RL4_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RMEM 0x4A PMC2
+UMASK_PM_MRK_DATA_FROM_RMEM 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DATA_FROM_RMEM_CYC 0x2A PMC1
+UMASK_PM_MRK_DATA_FROM_RMEM_CYC 0x0C 0x20 0x00
+
+EVENT_PM_MRK_DCACHE_RELOAD_INTV 0x18 PMC3
+UMASK_PM_MRK_DCACHE_RELOAD_INTV 0x00 0x20 0x00
+
+EVENT_PM_MRK_DERAT_MISS 0xE6 PMC2
+UMASK_PM_MRK_DERAT_MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_DERAT_MISS_16G 0x54 PMC3
+UMASK_PM_MRK_DERAT_MISS_16G 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DERAT_MISS_16M 0x54 PMC2
+UMASK_PM_MRK_DERAT_MISS_16M 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DERAT_MISS_4K 0x56 PMC0
+UMASK_PM_MRK_DERAT_MISS_4K 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DERAT_MISS_64K 0x54 PMC1
+UMASK_PM_MRK_DERAT_MISS_64K 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DFU_FIN 0x32 PMC1
+UMASK_PM_MRK_DFU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_MRK_DPTEG_FROM_DL2L3_MOD 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_MRK_DPTEG_FROM_DL2L3_SHR 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_DL4 0x4C PMC2
+UMASK_PM_MRK_DPTEG_FROM_DL4 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_DMEM 0x4C PMC3
+UMASK_PM_MRK_DPTEG_FROM_DMEM 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2 0x42 PMC0
+UMASK_PM_MRK_DPTEG_FROM_L2 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_MRK_DPTEG_FROM_L21_MOD 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_MRK_DPTEG_FROM_L21_SHR 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2MISS 0x4E PMC0
+UMASK_PM_MRK_DPTEG_FROM_L2MISS 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_LDHITST 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_MRK_DPTEG_FROM_L2_DISP_CONFLICT_OTHER 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_MRK_DPTEG_FROM_L2_MEPF 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_MRK_DPTEG_FROM_L2_NO_CONFLICT 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L3 0x42 PMC3
+UMASK_PM_MRK_DPTEG_FROM_L3 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_MRK_DPTEG_FROM_L31_ECO_MOD 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_MRK_DPTEG_FROM_L31_ECO_SHR 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_MRK_DPTEG_FROM_L31_MOD 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_MRK_DPTEG_FROM_L31_SHR 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L3MISS 0x4E PMC3
+UMASK_PM_MRK_DPTEG_FROM_L3MISS 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_MRK_DPTEG_FROM_L3_MEPF 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_MRK_DPTEG_FROM_L3_NO_CONFLICT 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_LL4 0x4C PMC0
+UMASK_PM_MRK_DPTEG_FROM_LL4 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_LMEM 0x48 PMC1
+UMASK_PM_MRK_DPTEG_FROM_LMEM 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_MEMORY 0x4C PMC1
+UMASK_PM_MRK_DPTEG_FROM_MEMORY 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_MRK_DPTEG_FROM_ON_CHIP_CACHE 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_MRK_DPTEG_FROM_RL2L3_MOD 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_MRK_DPTEG_FROM_RL2L3_SHR 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_RL4 0x4A PMC1
+UMASK_PM_MRK_DPTEG_FROM_RL4 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DPTEG_FROM_RMEM 0x4A PMC2
+UMASK_PM_MRK_DPTEG_FROM_RMEM 0x0F 0x20 0x00
+
+EVENT_PM_MRK_DTLB_MISS 0xE4 PMC3
+UMASK_PM_MRK_DTLB_MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_DTLB_MISS_16G 0x58 PMC0
+UMASK_PM_MRK_DTLB_MISS_16G 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DTLB_MISS_16M 0x56 PMC3
+UMASK_PM_MRK_DTLB_MISS_16M 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DTLB_MISS_4K 0x56 PMC1
+UMASK_PM_MRK_DTLB_MISS_4K 0x0D 0x20 0x00
+
+EVENT_PM_MRK_DTLB_MISS_64K 0x56 PMC2
+UMASK_PM_MRK_DTLB_MISS_64K 0x0D 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_BKILL 0x54 PMC3
+UMASK_PM_MRK_FAB_RSP_BKILL 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_BKILL_CYC 0x50 PMC1
+UMASK_PM_MRK_FAB_RSP_BKILL_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_CLAIM_RTY 0x5E PMC2
+UMASK_PM_MRK_FAB_RSP_CLAIM_RTY 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_DCLAIM 0x54 PMC2
+UMASK_PM_MRK_FAB_RSP_DCLAIM 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_DCLAIM_CYC 0x52 PMC1
+UMASK_PM_MRK_FAB_RSP_DCLAIM_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_MATCH 0x56 PMC2
+UMASK_PM_MRK_FAB_RSP_MATCH 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_MATCH_CYC 0x52 PMC3
+UMASK_PM_MRK_FAB_RSP_MATCH_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_RD_RTY 0x5E PMC3
+UMASK_PM_MRK_FAB_RSP_RD_RTY 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_RD_T_INTV 0x5E PMC0
+UMASK_PM_MRK_FAB_RSP_RD_T_INTV 0x00 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_RWITM_CYC 0x50 PMC3
+UMASK_PM_MRK_FAB_RSP_RWITM_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_FAB_RSP_RWITM_RTY 0x5E PMC1
+UMASK_PM_MRK_FAB_RSP_RWITM_RTY 0x00 0x20 0x00
+
+EVENT_PM_MRK_FILT_MATCH 0x3C PMC1
+UMASK_PM_MRK_FILT_MATCH 0x00 0x20 0x00
+
+EVENT_PM_MRK_FIN_STALL_CYC 0x3C PMC0
+UMASK_PM_MRK_FIN_STALL_CYC 0x00 0x20 0x00
+
+EVENT_PM_MRK_FXU_FIN 0x34 PMC1
+UMASK_PM_MRK_FXU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_GRP_CMPL 0x30 PMC3
+UMASK_PM_MRK_GRP_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_GRP_IC_MISS 0x3A PMC3
+UMASK_PM_MRK_GRP_IC_MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_GRP_NTC 0x3C PMC2
+UMASK_PM_MRK_GRP_NTC 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_CMPL 0xE0 PMC3
+UMASK_PM_MRK_INST_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_DECODED 0x30 PMC1
+UMASK_PM_MRK_INST_DECODED 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_DISP 0xE0 PMC0
+UMASK_PM_MRK_INST_DISP 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_FIN 0x30 PMC2
+UMASK_PM_MRK_INST_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_FROM_L3MISS 0xE6 PMC3
+UMASK_PM_MRK_INST_FROM_L3MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_ISSUED 0x32 PMC0
+UMASK_PM_MRK_INST_ISSUED 0x00 0x20 0x00
+
+EVENT_PM_MRK_INST_TIMEO 0x34 PMC3
+UMASK_PM_MRK_INST_TIMEO 0x00 0x20 0x00
+
+EVENT_PM_MRK_L1_ICACHE_MISS 0xE4 PMC0
+UMASK_PM_MRK_L1_ICACHE_MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_L1_RELOAD_VALID 0xEA PMC0
+UMASK_PM_MRK_L1_RELOAD_VALID 0x00 0x20 0x00
+
+EVENT_PM_MRK_L2_RC_DISP 0x14 PMC1
+UMASK_PM_MRK_L2_RC_DISP 0x00 0x20 0x00
+
+EVENT_PM_MRK_L2_RC_DONE 0x2A PMC2
+UMASK_PM_MRK_L2_RC_DONE 0x00 0x20 0x00
+
+EVENT_PM_MRK_LARX_FIN 0x16 PMC3
+UMASK_PM_MRK_LARX_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_LD_MISS_EXPOSED 0x3F PMC0
+UMASK_PM_MRK_LD_MISS_EXPOSED 0x00 0x20 0x00
+
+EVENT_PM_MRK_LD_MISS_EXPOSED_CYC 0x3E PMC0
+UMASK_PM_MRK_LD_MISS_EXPOSED_CYC 0x00 0x20 0x00
+
+EVENT_PM_MRK_LD_MISS_L1 0xE2 PMC1
+UMASK_PM_MRK_LD_MISS_L1 0x00 0x20 0x00
+
+EVENT_PM_MRK_LD_MISS_L1_CYC 0x3E PMC3
+UMASK_PM_MRK_LD_MISS_L1_CYC 0x00 0x20 0x00
+
+EVENT_PM_MRK_LSU_FIN 0x32 PMC3
+UMASK_PM_MRK_LSU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_LSU_FLUSH 0x80 PMC
+UMASK_PM_MRK_LSU_FLUSH 0x0D 0x20 0x00
+
+EVENT_PM_MRK_LSU_FLUSH_LRQ 0x88 PMC
+UMASK_PM_MRK_LSU_FLUSH_LRQ 0x0D 0x20 0x00
+
+EVENT_PM_MRK_LSU_FLUSH_SRQ 0x8A PMC
+UMASK_PM_MRK_LSU_FLUSH_SRQ 0x0D 0x20 0x00
+
+EVENT_PM_MRK_LSU_FLUSH_ULD 0x84 PMC
+UMASK_PM_MRK_LSU_FLUSH_ULD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_LSU_FLUSH_UST 0x86 PMC
+UMASK_PM_MRK_LSU_FLUSH_UST 0x0D 0x20 0x00
+
+EVENT_PM_MRK_LSU_REJECT 0x64 PMC3
+UMASK_PM_MRK_LSU_REJECT 0x00 0x20 0x00
+
+EVENT_PM_MRK_LSU_REJECT_ERAT_MISS 0x64 PMC2
+UMASK_PM_MRK_LSU_REJECT_ERAT_MISS 0x00 0x20 0x00
+
+EVENT_PM_MRK_NTF_FIN 0x12 PMC1
+UMASK_PM_MRK_NTF_FIN 0x00 0x20 0x00
+
+EVENT_PM_MRK_RUN_CYC 0x5E PMC0
+UMASK_PM_MRK_RUN_CYC 0x0D 0x20 0x00
+
+EVENT_PM_MRK_SRC_PREF_TRACK_EFF 0x5A PMC0
+UMASK_PM_MRK_SRC_PREF_TRACK_EFF 0x0D 0x20 0x00
+
+EVENT_PM_MRK_SRC_PREF_TRACK_INEFF 0x5A PMC2
+UMASK_PM_MRK_SRC_PREF_TRACK_INEFF 0x0D 0x20 0x00
+
+EVENT_PM_MRK_SRC_PREF_TRACK_MOD 0x5C PMC3
+UMASK_PM_MRK_SRC_PREF_TRACK_MOD 0x0D 0x20 0x00
+
+EVENT_PM_MRK_SRC_PREF_TRACK_MOD_L2 0x5C PMC0
+UMASK_PM_MRK_SRC_PREF_TRACK_MOD_L2 0x0D 0x20 0x00
+
+EVENT_PM_MRK_SRC_PREF_TRACK_MOD_L3 0x5C PMC2
+UMASK_PM_MRK_SRC_PREF_TRACK_MOD_L3 0x0D 0x20 0x00
+
+EVENT_PM_MRK_STALL_CMPLU_CYC 0x3E PMC2
+UMASK_PM_MRK_STALL_CMPLU_CYC 0x00 0x20 0x00
+
+EVENT_PM_MRK_STCX_FAIL 0x58 PMC2
+UMASK_PM_MRK_STCX_FAIL 0x0E 0x20 0x00
+
+EVENT_PM_MRK_ST_CMPL 0x34 PMC0
+UMASK_PM_MRK_ST_CMPL 0x00 0x20 0x00
+
+EVENT_PM_MRK_ST_CMPL_INT 0x34 PMC2
+UMASK_PM_MRK_ST_CMPL_INT 0x00 0x20 0x00
+
+EVENT_PM_MRK_ST_DRAIN_TO_L2DISP_CYC 0x50 PMC2
+UMASK_PM_MRK_ST_DRAIN_TO_L2DISP_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_ST_FWD 0x2C PMC2
+UMASK_PM_MRK_ST_FWD 0x00 0x20 0x00
+
+EVENT_PM_MRK_ST_L2DISP_TO_CMPL_CYC 0x50 PMC0
+UMASK_PM_MRK_ST_L2DISP_TO_CMPL_CYC 0x0F 0x20 0x00
+
+EVENT_PM_MRK_ST_NEST 0x38 PMC1
+UMASK_PM_MRK_ST_NEST 0x00 0x20 0x00
+
+EVENT_PM_MRK_TGT_PREF_TRACK_EFF 0x5A PMC0
+UMASK_PM_MRK_TGT_PREF_TRACK_EFF 0x0C 0x20 0x00
+
+EVENT_PM_MRK_TGT_PREF_TRACK_INEFF 0x5A PMC2
+UMASK_PM_MRK_TGT_PREF_TRACK_INEFF 0x0C 0x20 0x00
+
+EVENT_PM_MRK_TGT_PREF_TRACK_MOD 0x5C PMC3
+UMASK_PM_MRK_TGT_PREF_TRACK_MOD 0x0C 0x20 0x00
+
+EVENT_PM_MRK_TGT_PREF_TRACK_MOD_L2 0x5C PMC0
+UMASK_PM_MRK_TGT_PREF_TRACK_MOD_L2 0x0C 0x20 0x00
+
+EVENT_PM_MRK_TGT_PREF_TRACK_MOD_L3 0x5C PMC2
+UMASK_PM_MRK_TGT_PREF_TRACK_MOD_L3 0x0C 0x20 0x00
+
+EVENT_PM_MRK_VSU_FIN 0x32 PMC2
+UMASK_PM_MRK_VSU_FIN 0x00 0x20 0x00
+
+EVENT_PM_MULT_MRK 0x5E PMC2
+UMASK_PM_MULT_MRK 0x0D 0x20 0x00
+
+EVENT_PM_NESTED_TEND 0xB0 PMC
+UMASK_PM_NESTED_TEND 0x02
+
+EVENT_PM_NEST_REF_CLK 0x6E PMC2
+UMASK_PM_NEST_REF_CLK 0x00
+
+EVENT_PM_NON_FAV_TBEGIN 0xB6 PMC
+UMASK_PM_NON_FAV_TBEGIN 0x02
+
+EVENT_PM_NTCG_ALL_FIN 0x1A PMC1
+UMASK_PM_NTCG_ALL_FIN 0x00
+
+EVENT_PM_OUTER_TBEGIN 0xAC PMC
+UMASK_PM_OUTER_TBEGIN 0x02
+
+EVENT_PM_OUTER_TEND 0xAE PMC
+UMASK_PM_OUTER_TEND 0x02
+
+EVENT_PM_PMC1_OVERFLOW 0x10 PMC1
+UMASK_PM_PMC1_OVERFLOW 0x00
+
+EVENT_PM_PMC2_OVERFLOW 0x10 PMC2
+UMASK_PM_PMC2_OVERFLOW 0x00
+
+EVENT_PM_PMC2_REWIND 0x20 PMC2
+UMASK_PM_PMC2_REWIND 0x00
+
+EVENT_PM_PMC2_SAVED 0x22 PMC0
+UMASK_PM_PMC2_SAVED 0x00
+
+EVENT_PM_PMC3_OVERFLOW 0x10 PMC3
+UMASK_PM_PMC3_OVERFLOW 0x00
+
+EVENT_PM_PMC4_OVERFLOW 0x10 PMC0
+UMASK_PM_PMC4_OVERFLOW 0x00
+
+EVENT_PM_PMC4_REWIND 0x20 PMC0
+UMASK_PM_PMC4_REWIND 0x00
+
+EVENT_PM_PMC4_SAVED 0x22 PMC2
+UMASK_PM_PMC4_SAVED 0x00
+
+EVENT_PM_PMC5_OVERFLOW 0x24 PMC0
+UMASK_PM_PMC5_OVERFLOW 0x00
+
+EVENT_PM_PMC6_OVERFLOW 0x24 PMC2
+UMASK_PM_PMC6_OVERFLOW 0x00
+
+EVENT_PM_PREF_TRACKED 0x5A PMC1
+UMASK_PM_PREF_TRACKED 0x00
+
+EVENT_PM_PREF_TRACK_EFF 0x5A PMC0
+UMASK_PM_PREF_TRACK_EFF 0x00
+
+EVENT_PM_PREF_TRACK_INEFF 0x5A PMC2
+UMASK_PM_PREF_TRACK_INEFF 0x00
+
+EVENT_PM_PREF_TRACK_MOD 0x5A PMC3
+UMASK_PM_PREF_TRACK_MOD 0x00
+
+EVENT_PM_PREF_TRACK_MOD_L2 0x5C PMC0
+UMASK_PM_PREF_TRACK_MOD_L2 0x00
+
+EVENT_PM_PREF_TRACK_MOD_L3 0x5C PMC2
+UMASK_PM_PREF_TRACK_MOD_L3 0x00
+
+EVENT_PM_PROBE_NOP_DISP 0x14 PMC3
+UMASK_PM_PROBE_NOP_DISP 0x00
+
+EVENT_PM_PTE_PREFETCH 0x84 PMC
+UMASK_PM_PTE_PREFETCH 0x0E
+
+EVENT_PM_PUMP_CPRED 0x54 PMC0
+UMASK_PM_PUMP_CPRED 0x00
+
+EVENT_PM_PUMP_MPRED 0x52 PMC3
+UMASK_PM_PUMP_MPRED 0x00
+
+EVENT_PM_RC0_ALLOC 0x81 PMC0
+UMASK_PM_RC0_ALLOC 0x06
+
+EVENT_PM_RC0_BUSY 0x80 PMC0
+UMASK_PM_RC0_BUSY 0x06
+
+EVENT_PM_RC_LIFETIME_EXC_1024 0xEA PMC2
+UMASK_PM_RC_LIFETIME_EXC_1024 0x00 0x60 0x01
+
+EVENT_PM_RC_LIFETIME_EXC_2048 0xEC PMC3
+UMASK_PM_RC_LIFETIME_EXC_2048 0x00 0x60 0x01
+
+EVENT_PM_RC_LIFETIME_EXC_256 0xE8 PMC0
+UMASK_PM_RC_LIFETIME_EXC_256 0x00 0x60 0x01
+
+EVENT_PM_RC_LIFETIME_EXC_32 0xE6 PMC1
+UMASK_PM_RC_LIFETIME_EXC_32 0x00 0x60 0x01
+
+EVENT_PM_RC_USAGE 0x88 PMC2
+UMASK_PM_RC_USAGE 0x06
+
+EVENT_PM_REAL_SRQ_FULL 0x04 PMC1
+UMASK_PM_REAL_SRQ_FULL 0x00
+
+EVENT_PM_RUN_CYC 0xF4 PMC5|PMC1
+UMASK_PM_RUN_CYC 0x00
+
+EVENT_PM_RUN_CYC_SMT2_MODE 0x6C PMC2
+UMASK_PM_RUN_CYC_SMT2_MODE 0x00
+
+EVENT_PM_RUN_CYC_SMT2_SHRD_MODE 0x6A PMC1
+UMASK_PM_RUN_CYC_SMT2_SHRD_MODE 0x00
+
+EVENT_PM_RUN_CYC_SMT2_SPLIT_MODE 0x6A PMC0
+UMASK_PM_RUN_CYC_SMT2_SPLIT_MODE 0x00
+
+EVENT_PM_RUN_CYC_SMT4_MODE 0x6C PMC1
+UMASK_PM_RUN_CYC_SMT4_MODE 0x00
+
+EVENT_PM_RUN_CYC_SMT8_MODE 0x6C PMC3
+UMASK_PM_RUN_CYC_SMT8_MODE 0x00
+
+EVENT_PM_RUN_CYC_ST_MODE 0x6C PMC0
+UMASK_PM_RUN_CYC_ST_MODE 0x00
+
+EVENT_PM_RUN_INST_CMPL 0xFA PMC4
+UMASK_PM_RUN_INST_CMPL 0x00
+
+EVENT_PM_RUN_INST_CMPL_ALT 0xFA PMC3
+UMASK_PM_RUN_INST_CMPL_ALT 0x00
+
+EVENT_PM_RUN_PURR 0xF4 PMC3
+UMASK_PM_RUN_PURR 0x00
+
+EVENT_PM_RUN_SPURR 0x08 PMC0
+UMASK_PM_RUN_SPURR 0x00
+
+EVENT_PM_SEC_ERAT_HIT 0x82 PMC
+UMASK_PM_SEC_ERAT_HIT 0x0F
+
+EVENT_PM_SHL_CREATED 0x8C PMC
+UMASK_PM_SHL_CREATED 0x05
+
+EVENT_PM_SHL_ST_CONVERT 0x8E PMC
+UMASK_PM_SHL_ST_CONVERT 0x05
+
+EVENT_PM_SHL_ST_DISABLE 0x90 PMC
+UMASK_PM_SHL_ST_DISABLE 0x05
+
+EVENT_PM_SN0_ALLOC 0x85 PMC1
+UMASK_PM_SN0_ALLOC 0x06
+
+EVENT_PM_SN0_BUSY 0x84 PMC1
+UMASK_PM_SN0_BUSY 0x06
+
+EVENT_PM_SNOOP_TLBIE 0xB2 PMC
+UMASK_PM_SNOOP_TLBIE 0x0D
+
+EVENT_PM_SN_USAGE 0x8C PMC3
+UMASK_PM_SN_USAGE 0x06
+
+EVENT_PM_STALL_END_GCT_EMPTY 0x28 PMC0
+UMASK_PM_STALL_END_GCT_EMPTY 0x00
+
+EVENT_PM_STCX_FAIL 0x58 PMC0
+UMASK_PM_STCX_FAIL 0x0E
+
+EVENT_PM_STCX_LSU 0x90 PMC
+UMASK_PM_STCX_LSU 0x0C
+
+EVENT_PM_ST_CMPL 0x16 PMC1
+UMASK_PM_ST_CMPL 0x00
+
+EVENT_PM_ST_FIN 0xF0 PMC1
+UMASK_PM_ST_FIN 0x00
+
+EVENT_PM_ST_FWD 0x18 PMC1
+UMASK_PM_ST_FWD 0x00
+
+EVENT_PM_ST_MISS_L1 0xF0 PMC2
+UMASK_PM_ST_MISS_L1 0x00
+
+EVENT_PM_SUSPENDED 0x00 PMC
+UMASK_PM_SUSPENDED 0x00
+
+EVENT_PM_SWAP_CANCEL 0x90 PMC
+UMASK_PM_SWAP_CANCEL 0x03
+
+EVENT_PM_SWAP_CANCEL_GPR 0x92 PMC
+UMASK_PM_SWAP_CANCEL_GPR 0x03
+
+EVENT_PM_SWAP_COMPLETE 0x8C PMC
+UMASK_PM_SWAP_COMPLETE 0x03
+
+EVENT_PM_SWAP_COMPLETE_GPR 0x8E PMC
+UMASK_PM_SWAP_COMPLETE_GPR 0x03
+
+EVENT_PM_SYNC_MRK_BR_LINK 0x52 PMC0
+UMASK_PM_SYNC_MRK_BR_LINK 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_BR_MPRED 0x5C PMC0
+UMASK_PM_SYNC_MRK_BR_MPRED 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_FX_DIVIDE 0x56 PMC0
+UMASK_PM_SYNC_MRK_FX_DIVIDE 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_L2HIT 0x58 PMC0
+UMASK_PM_SYNC_MRK_L2HIT 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_L2MISS 0x5A PMC0
+UMASK_PM_SYNC_MRK_L2MISS 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_L3MISS 0x54 PMC0
+UMASK_PM_SYNC_MRK_L3MISS 0x05 0x20 0x00
+
+EVENT_PM_SYNC_MRK_PROBE_NOP 0x50 PMC0
+UMASK_PM_SYNC_MRK_PROBE_NOP 0x05 0x20 0x00
+
+EVENT_PM_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_SYS_PUMP_CPRED 0x00
+
+EVENT_PM_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_SYS_PUMP_MPRED 0x00
+
+EVENT_PM_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_SYS_PUMP_MPRED_RTY 0x00
+
+EVENT_PM_TABLEWALK_CYC 0x26 PMC0
+UMASK_PM_TABLEWALK_CYC 0x00
+
+EVENT_PM_TABLEWALK_CYC_PREF 0x86 PMC
+UMASK_PM_TABLEWALK_CYC_PREF 0x0E
+
+EVENT_PM_TABORT_TRECLAIM 0xB2 PMC
+UMASK_PM_TABORT_TRECLAIM 0x02
+
+EVENT_PM_TB_BIT_TRANS 0xF8 PMC2
+UMASK_PM_TB_BIT_TRANS 0x00
+
+EVENT_PM_TEND_PEND_CYC 0xBA PMC
+UMASK_PM_TEND_PEND_CYC 0x0E
+
+EVENT_PM_THRD_ALL_RUN_CYC 0x0C PMC1
+UMASK_PM_THRD_ALL_RUN_CYC 0x00
+
+EVENT_PM_THRD_CONC_RUN_INST 0xF4 PMC2
+UMASK_PM_THRD_CONC_RUN_INST 0x00
+
+EVENT_PM_THRD_GRP_CMPL_BOTH_CYC 0x12 PMC0
+UMASK_PM_THRD_GRP_CMPL_BOTH_CYC 0x00
+
+EVENT_PM_THRD_PRIO_0_1_CYC 0xBC PMC
+UMASK_PM_THRD_PRIO_0_1_CYC 0x04
+
+EVENT_PM_THRD_PRIO_2_3_CYC 0xBE PMC
+UMASK_PM_THRD_PRIO_2_3_CYC 0x04
+
+EVENT_PM_THRD_PRIO_4_5_CYC 0x80 PMC
+UMASK_PM_THRD_PRIO_4_5_CYC 0x05
+
+EVENT_PM_THRD_PRIO_6_7_CYC 0x82 PMC
+UMASK_PM_THRD_PRIO_6_7_CYC 0x05
+
+EVENT_PM_THRD_REBAL_CYC 0x98 PMC
+UMASK_PM_THRD_REBAL_CYC 0x03
+
+EVENT_PM_THRESH_EXC_1024 0xEA PMC2
+UMASK_PM_THRESH_EXC_1024 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_128 0xEA PMC3
+UMASK_PM_THRESH_EXC_128 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_2048 0xEC PMC3
+UMASK_PM_THRESH_EXC_2048 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_256 0xE8 PMC0
+UMASK_PM_THRESH_EXC_256 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_32 0xE6 PMC1
+UMASK_PM_THRESH_EXC_32 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_4096 0xE6 PMC0
+UMASK_PM_THRESH_EXC_4096 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_512 0xE8 PMC1
+UMASK_PM_THRESH_EXC_512 0x00 0x20 0x00
+
+EVENT_PM_THRESH_EXC_64 0xE8 PMC2
+UMASK_PM_THRESH_EXC_64 0x00 0x20 0x00
+
+EVENT_PM_THRESH_MET 0xEC PMC0
+UMASK_PM_THRESH_MET 0x00 0x20 0x00
+
+EVENT_PM_THRESH_NOT_MET 0x6E PMC3
+UMASK_PM_THRESH_NOT_MET 0x00 0x20 0x00
+
+EVENT_PM_TLBIE_FIN 0x58 PMC2
+UMASK_PM_TLBIE_FIN 0x00
+
+EVENT_PM_TLB_MISS 0x66 PMC1
+UMASK_PM_TLB_MISS 0x00
+
+EVENT_PM_TM_BEGIN_ALL 0xB8 PMC
+UMASK_PM_TM_BEGIN_ALL 0x02
+
+EVENT_PM_TM_END_ALL 0xBA PMC
+UMASK_PM_TM_END_ALL 0x02
+
+EVENT_PM_TM_FAIL_CONF_NON_TM 0x86 PMC
+UMASK_PM_TM_FAIL_CONF_NON_TM 0x03
+
+EVENT_PM_TM_FAIL_CON_TM 0x88 PMC
+UMASK_PM_TM_FAIL_CON_TM 0x03
+
+EVENT_PM_TM_FAIL_DISALLOW 0xB2 PMC
+UMASK_PM_TM_FAIL_DISALLOW 0x0E
+
+EVENT_PM_TM_FAIL_FOOTPRINT_OVERFLOW 0x84 PMC
+UMASK_PM_TM_FAIL_FOOTPRINT_OVERFLOW 0x03
+
+EVENT_PM_TM_FAIL_NON_TX_CONFLICT 0xB8 PMC
+UMASK_PM_TM_FAIL_NON_TX_CONFLICT 0x0E
+
+EVENT_PM_TM_FAIL_SELF 0x8A PMC
+UMASK_PM_TM_FAIL_SELF 0x03
+
+EVENT_PM_TM_FAIL_TLBIE 0xB4 PMC
+UMASK_PM_TM_FAIL_TLBIE 0x0E
+
+EVENT_PM_TM_FAIL_TX_CONFLICT 0xB6 PMC
+UMASK_PM_TM_FAIL_TX_CONFLICT 0x0E
+
+EVENT_PM_TM_TBEGIN 0xBC PMC
+UMASK_PM_TM_TBEGIN 0x02
+
+EVENT_PM_TM_TRANS_RUN_CYC 0x60 PMC0
+UMASK_PM_TM_TRANS_RUN_CYC 0x00
+
+EVENT_PM_TM_TRANS_RUN_INST 0x60 PMC2
+UMASK_PM_TM_TRANS_RUN_INST 0x00
+
+EVENT_PM_TM_TRESUME 0x80 PMC
+UMASK_PM_TM_TRESUME 0x03
+
+EVENT_PM_TM_TSUSPEND 0xBE PMC
+UMASK_PM_TM_TSUSPEND 0x02
+
+EVENT_PM_TM_TX_PASS_RUN_CYC 0x12 PMC1
+UMASK_PM_TM_TX_PASS_RUN_CYC 0x0E
+
+EVENT_PM_TM_TX_PASS_RUN_INST 0x14 PMC3
+UMASK_PM_TM_TX_PASS_RUN_INST 0x0E
+
+EVENT_PM_UP_PREF_L3 0x8C PMC
+UMASK_PM_UP_PREF_L3 0x0E
+
+EVENT_PM_UP_PREF_POINTER 0x8E PMC
+UMASK_PM_UP_PREF_POINTER 0x0E
+
+EVENT_PM_VSU0_16FLOP 0xA4 PMC
+UMASK_PM_VSU0_16FLOP 0x0A
+
+EVENT_PM_VSU0_1FLOP 0x80 PMC
+UMASK_PM_VSU0_1FLOP 0x0A
+
+EVENT_PM_VSU0_2FLOP 0x98 PMC
+UMASK_PM_VSU0_2FLOP 0x0A
+
+EVENT_PM_VSU0_4FLOP 0x9C PMC
+UMASK_PM_VSU0_4FLOP 0x0A
+
+EVENT_PM_VSU0_8FLOP 0xA0 PMC
+UMASK_PM_VSU0_8FLOP 0x0A
+
+EVENT_PM_VSU0_COMPLEX_ISSUED 0xA4 PMC
+UMASK_PM_VSU0_COMPLEX_ISSUED 0x0B
+
+EVENT_PM_VSU0_CY_ISSUED 0xB4 PMC
+UMASK_PM_VSU0_CY_ISSUED 0x0B
+
+EVENT_PM_VSU0_DD_ISSUED 0xA8 PMC
+UMASK_PM_VSU0_DD_ISSUED 0x0B
+
+EVENT_PM_VSU0_DP_2FLOP 0x8C PMC
+UMASK_PM_VSU0_DP_2FLOP 0x0A
+
+EVENT_PM_VSU0_DP_FMA 0x90 PMC
+UMASK_PM_VSU0_DP_FMA 0x0A
+
+EVENT_PM_VSU0_DP_FSQRT_FDIV 0x94 PMC
+UMASK_PM_VSU0_DP_FSQRT_FDIV 0x0A
+
+EVENT_PM_VSU0_DQ_ISSUED 0xAC PMC
+UMASK_PM_VSU0_DQ_ISSUED 0x0B
+
+EVENT_PM_VSU0_EX_ISSUED 0xB0 PMC
+UMASK_PM_VSU0_EX_ISSUED 0x0B
+
+EVENT_PM_VSU0_FIN 0xBC PMC
+UMASK_PM_VSU0_FIN 0x0A
+
+EVENT_PM_VSU0_FMA 0x84 PMC
+UMASK_PM_VSU0_FMA 0x0A
+
+EVENT_PM_VSU0_FPSCR 0x98 PMC
+UMASK_PM_VSU0_FPSCR 0x0B
+
+EVENT_PM_VSU0_FSQRT_FDIV 0x88 PMC
+UMASK_PM_VSU0_FSQRT_FDIV 0x0A
+
+EVENT_PM_VSU0_PERMUTE_ISSUED 0x90 PMC
+UMASK_PM_VSU0_PERMUTE_ISSUED 0x0B
+
+EVENT_PM_VSU0_SCALAR_DP_ISSUED 0x88 PMC
+UMASK_PM_VSU0_SCALAR_DP_ISSUED 0x0B
+
+EVENT_PM_VSU0_SIMPLE_ISSUED 0x94 PMC
+UMASK_PM_VSU0_SIMPLE_ISSUED 0x0B
+
+EVENT_PM_VSU0_SINGLE 0xA8 PMC
+UMASK_PM_VSU0_SINGLE 0x0A
+
+EVENT_PM_VSU0_SQ 0x9C PMC
+UMASK_PM_VSU0_SQ 0x0B
+
+EVENT_PM_VSU0_STF 0x8C PMC
+UMASK_PM_VSU0_STF 0x0B
+
+EVENT_PM_VSU0_VECTOR_DP_ISSUED 0x80 PMC
+UMASK_PM_VSU0_VECTOR_DP_ISSUED 0x0B
+
+EVENT_PM_VSU0_VECTOR_SP_ISSUED 0x84 PMC
+UMASK_PM_VSU0_VECTOR_SP_ISSUED 0x0B
+
+EVENT_PM_VSU1_16FLOP 0xA6 PMC
+UMASK_PM_VSU1_16FLOP 0x0A
+
+EVENT_PM_VSU1_1FLOP 0x82 PMC
+UMASK_PM_VSU1_1FLOP 0x0A
+
+EVENT_PM_VSU1_2FLOP 0x9A PMC
+UMASK_PM_VSU1_2FLOP 0x0A
+
+EVENT_PM_VSU1_4FLOP 0x9E PMC
+UMASK_PM_VSU1_4FLOP 0x0A
+
+EVENT_PM_VSU1_8FLOP 0xA2 PMC
+UMASK_PM_VSU1_8FLOP 0x0A
+
+EVENT_PM_VSU1_COMPLEX_ISSUED 0xA6 PMC
+UMASK_PM_VSU1_COMPLEX_ISSUED 0x0B
+
+EVENT_PM_VSU1_CY_ISSUED 0xB6 PMC
+UMASK_PM_VSU1_CY_ISSUED 0x0B
+
+EVENT_PM_VSU1_DD_ISSUED 0xAA PMC
+UMASK_PM_VSU1_DD_ISSUED 0x0B
+
+EVENT_PM_VSU1_DP_2FLOP 0x8E PMC
+UMASK_PM_VSU1_DP_2FLOP 0x0A
+
+EVENT_PM_VSU1_DP_FMA 0x92 PMC
+UMASK_PM_VSU1_DP_FMA 0x0A
+
+EVENT_PM_VSU1_DP_FSQRT_FDIV 0x96 PMC
+UMASK_PM_VSU1_DP_FSQRT_FDIV 0x0A
+
+EVENT_PM_VSU1_DQ_ISSUED 0xAE PMC
+UMASK_PM_VSU1_DQ_ISSUED 0x0B
+
+EVENT_PM_VSU1_EX_ISSUED 0xB2 PMC
+UMASK_PM_VSU1_EX_ISSUED 0x0B
+
+EVENT_PM_VSU1_FIN 0xBE PMC
+UMASK_PM_VSU1_FIN 0x0A
+
+EVENT_PM_VSU1_FMA 0x86 PMC
+UMASK_PM_VSU1_FMA 0x0A
+
+EVENT_PM_VSU1_FPSCR 0x9A PMC
+UMASK_PM_VSU1_FPSCR 0x0B
+
+EVENT_PM_VSU1_FSQRT_FDIV 0x8A PMC
+UMASK_PM_VSU1_FSQRT_FDIV 0x0A
+
+EVENT_PM_VSU1_PERMUTE_ISSUED 0x92 PMC
+UMASK_PM_VSU1_PERMUTE_ISSUED 0x0B
+
+EVENT_PM_VSU1_SCALAR_DP_ISSUED 0x8A PMC
+UMASK_PM_VSU1_SCALAR_DP_ISSUED 0x0B
+
+EVENT_PM_VSU1_SIMPLE_ISSUED 0x96 PMC
+UMASK_PM_VSU1_SIMPLE_ISSUED 0x0B
+
+EVENT_PM_VSU1_SINGLE 0xAA PMC
+UMASK_PM_VSU1_SINGLE 0x0A
+
+EVENT_PM_VSU1_SQ 0x9E PMC
+UMASK_PM_VSU1_SQ 0x0B
+
+EVENT_PM_VSU1_STF 0x8E PMC
+UMASK_PM_VSU1_STF 0x0B
+
+EVENT_PM_VSU1_VECTOR_DP_ISSUED 0x82 PMC
+UMASK_PM_VSU1_VECTOR_DP_ISSUED 0x0B
+
+EVENT_PM_VSU1_VECTOR_SP_ISSUED 0x86 PMC
+UMASK_PM_VSU1_VECTOR_SP_ISSUED 0x0B
+
+EVENT_PM_L3_L2_CO_HIT 0x88 PMC2
+UMASK_PM_L3_L2_CO_HIT 0x08 0x01 0x00
+
+EVENT_PM_L3_L2_CO_MISS 0x8A PMC2
+UMASK_PM_L3_L2_CO_MISS 0x08 0x01 0x00
+
+EVENT_PM_L2_CASTOUT_MOD 0x80 PMC0
+UMASK_PM_L2_CASTOUT_MOD 0x07 0x04 0x00
+
+EVENT_PM_L2_CASTOUT_SHR 0x82 PMC0
+UMASK_PM_L2_CASTOUT_SHR 0x07 0x04 0x00
+
+EVENT_PURR_STATE 0x00 PURR
+UMASK_PURR_STATE 0x00
+
+EVENT_SPURR_STATE 0x01 SPURR
+UMASK_SPURR_STATE 0x00
+
+EVENT_PM_L3_P0_CO_MEM 0x88 PMC2
+UMASK_PM_L3_P0_CO_MEM 0x08 0x05 0x00
diff --git a/src/includes/perfmon_power9.h b/src/includes/perfmon_power9.h
new file mode 100644
index 000000000..b218c1ce2
--- /dev/null
+++ b/src/includes/perfmon_power9.h
@@ -0,0 +1,12 @@
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+#include <perfmon_power9_counters.h>
+#include <perfmon_power9_events.h>
+
+static int perfmon_numCountersPower9 = NUM_COUNTERS_POWER9; /* total number of entries in power9_counter_map */
+static int perfmon_numCoreCountersPower9 = NUM_COUNTERS_POWER9; /* NOTE(review): same value as the total, although only PMC0-PMC5 have type PMC in power9_counter_map -- confirm this is intended */
+static int perfmon_numArchEventsPower9 = NUM_ARCH_EVENTS_POWER9; /* NUM_ARCH_EVENTS_POWER9 is not defined in this header; presumably provided by perfmon_power9_events.h -- confirm */
+
diff --git a/src/includes/perfmon_power9_counters.h b/src/includes/perfmon_power9_counters.h
new file mode 100644
index 000000000..4a6053014
--- /dev/null
+++ b/src/includes/perfmon_power9_counters.h
@@ -0,0 +1,90 @@
+/* POWER9 counter table: six core counters (PMC0-PMC5), then three counters each for MBOX0-7, QBOX0-2, SBOX0 and BBOX0-1. */
+/* Entries after PMC5 give only name, index and unit type; C zero-initializes the struct members they omit. */
+#define NUM_COUNTERS_POWER9 48
+
+static RegisterMap power9_counter_map[NUM_COUNTERS_POWER9] = {
+    {"PMC0", PMC0, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC1", PMC1, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC2", PMC2, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC3", PMC3, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC4", PMC4, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PMC5", PMC5, PMC, 0x0, 0x0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"MBOX0C0", PMC6, MBOX0},
+    {"MBOX0C1", PMC7, MBOX0},
+    {"MBOX0C2", PMC8, MBOX0},
+    {"MBOX1C0", PMC9, MBOX1},
+    {"MBOX1C1", PMC10, MBOX1},
+    {"MBOX1C2", PMC11, MBOX1},
+    {"MBOX2C0", PMC12, MBOX2},
+    {"MBOX2C1", PMC13, MBOX2},
+    {"MBOX2C2", PMC14, MBOX2},
+    {"MBOX3C0", PMC15, MBOX3},
+    {"MBOX3C1", PMC16, MBOX3},
+    {"MBOX3C2", PMC17, MBOX3},
+    {"MBOX4C0", PMC18, MBOX4},
+    {"MBOX4C1", PMC19, MBOX4},
+    {"MBOX4C2", PMC20, MBOX4},
+    {"MBOX5C0", PMC21, MBOX5},
+    {"MBOX5C1", PMC22, MBOX5},
+    {"MBOX5C2", PMC23, MBOX5},
+    {"MBOX6C0", PMC24, MBOX6},
+    {"MBOX6C1", PMC25, MBOX6},
+    {"MBOX6C2", PMC26, MBOX6},
+    {"MBOX7C0", PMC27, MBOX7},
+    {"MBOX7C1", PMC28, MBOX7},
+    {"MBOX7C2", PMC29, MBOX7},
+    {"QBOX0C0", PMC30, QBOX0},
+    {"QBOX0C1", PMC31, QBOX0},
+    {"QBOX0C2", PMC32, QBOX0},
+    {"QBOX1C0", PMC33, QBOX1},
+    {"QBOX1C1", PMC34, QBOX1},
+    {"QBOX1C2", PMC35, QBOX1},
+    {"QBOX2C0", PMC36, QBOX2},
+    {"QBOX2C1", PMC37, QBOX2},
+    {"QBOX2C2", PMC38, QBOX2},
+    {"SBOX0C0", PMC39, SBOX0},
+    {"SBOX0C1", PMC40, SBOX0},
+    {"SBOX0C2", PMC41, SBOX0},
+    {"BBOX0C0", PMC42, BBOX0},
+    {"BBOX0C1", PMC43, BBOX0},
+    {"BBOX0C2", PMC44, BBOX0},
+    {"BBOX1C0", PMC45, BBOX1},
+    {"BBOX1C1", PMC46, BBOX1},
+    {"BBOX1C2", PMC47, BBOX1},
+};
+
+static BoxMap power9_box_map[NUM_UNITS] = { /* one identical entry per unit type referenced by power9_counter_map */
+    [PMC] = {0x0, 0x0, 0x0, 0, 0, 0, 64}, /* NOTE(review): the trailing 64 is presumably the counter width in bits -- confirm against the BoxMap definition */
+    [MBOX0] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX1] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX2] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX3] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX4] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX5] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX6] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [MBOX7] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [QBOX0] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [QBOX1] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [QBOX2] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [SBOX0] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [BBOX0] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+    [BBOX1] = {0x0, 0x0, 0x0, 0, 0, 0, 64},
+};
+
+static char* power9_translate_types[NUM_UNITS] = { /* unit type -> sysfs event_source device directory for that unit */
+    [PMC] = "/sys/bus/event_source/devices/cpu",
+    [MBOX0] = "/sys/bus/event_source/devices/nest_mba0_imc",
+    [MBOX1] = "/sys/bus/event_source/devices/nest_mba1_imc",
+    [MBOX2] = "/sys/bus/event_source/devices/nest_mba2_imc",
+    [MBOX3] = "/sys/bus/event_source/devices/nest_mba3_imc",
+    [MBOX4] = "/sys/bus/event_source/devices/nest_mba4_imc",
+    [MBOX5] = "/sys/bus/event_source/devices/nest_mba5_imc",
+    [MBOX6] = "/sys/bus/event_source/devices/nest_mba6_imc",
+    [MBOX7] = "/sys/bus/event_source/devices/nest_mba7_imc",
+    [QBOX0] = "/sys/bus/event_source/devices/nest_xlink0_imc",
+    [QBOX1] = "/sys/bus/event_source/devices/nest_xlink1_imc",
+    [QBOX2] = "/sys/bus/event_source/devices/nest_xlink2_imc",
+    [SBOX0] = "/sys/bus/event_source/devices/nest_powerbus0_imc",
+    [BBOX0] = "/sys/bus/event_source/devices/nest_mcs01_imc",
+    [BBOX1] = "/sys/bus/event_source/devices/nest_mcs23_imc",
+};
diff --git a/src/includes/perfmon_power9_events.txt b/src/includes/perfmon_power9_events.txt
new file mode 100644
index 000000000..c72888269
--- /dev/null
+++ b/src/includes/perfmon_power9_events.txt
@@ -0,0 +1,2549 @@
+EVENT_PM_RUN_CYC 0xF4 PMC5
+UMASK_PM_RUN_CYC 0x00
+
+EVENT_PM_RUN_INST_CMPL 0xFA PMC4
+UMASK_PM_RUN_INST_CMPL 0x00
+
+
+#################
+# IFU Events
+#################
+
+
+EVENT_PM_IC_DEMAND_CYC 0x18 PMC0
+UMASK_PM_IC_DEMAND_CYC 0x00
+
+EVENT_PM_INST_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_INST_FROM_L2_NO_CONFLICT 0x40
+
+EVENT_PM_INST_FROM_L2 0x42 PMC0
+UMASK_PM_INST_FROM_L2 0x40
+
+EVENT_PM_INST_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_INST_FROM_L3_NO_CONFLICT 0x40
+
+EVENT_PM_INST_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_INST_FROM_L31_SHR 0x40
+
+EVENT_PM_INST_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_INST_FROM_ON_CHIP_CACHE 0x40
+
+EVENT_PM_INST_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_INST_FROM_RL2L3_SHR 0x40
+
+EVENT_PM_INST_FROM_LL4 0x4C PMC0
+UMASK_PM_INST_FROM_LL4 0x40
+
+EVENT_PM_INST_FROM_L2MISS 0x4E PMC0
+UMASK_PM_INST_FROM_L2MISS 0x40
+
+EVENT_PM_INST_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_INST_CHIP_PUMP_CPRED 0x40
+
+EVENT_PM_INST_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_INST_GRP_PUMP_MPRED_RTY 0x40
+
+EVENT_PM_INST_PUMP_CPRED 0x54 PMC0
+UMASK_PM_INST_PUMP_CPRED 0x40
+
+EVENT_PM_INST_CMPL 0xFE PMC0
+UMASK_PM_INST_CMPL 0x00
+
+EVENT_PM_INST_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_INST_FROM_L2_MEPF 0x40
+
+EVENT_PM_INST_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_INST_FROM_L3_MEPF 0x40
+
+EVENT_PM_INST_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_INST_FROM_L31_MOD 0x40
+
+EVENT_PM_INST_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_INST_FROM_RL2L3_MOD 0x40
+
+EVENT_PM_INST_FROM_LMEM 0x48 PMC1
+UMASK_PM_INST_FROM_LMEM 0x40
+
+EVENT_PM_INST_FROM_RL4 0x4A PMC1
+UMASK_PM_INST_FROM_RL4 0x40
+
+EVENT_PM_INST_FROM_MEMORY 0x4C PMC1
+UMASK_PM_INST_FROM_MEMORY 0x40
+
+EVENT_PM_INST_GRP_PUMP_CPRED 0x5C PMC1
+UMASK_PM_INST_GRP_PUMP_CPRED 0xC0
+
+EVENT_PM_INST_GRP_PUMP_MPRED 0x5E PMC1
+UMASK_PM_INST_GRP_PUMP_MPRED 0xC0
+
+EVENT_PM_INST_DISP 0xF2 PMC1
+UMASK_PM_INST_DISP 0x00
+
+EVENT_PM_L1_ICACHE_MISS 0xFD PMC1
+UMASK_PM_L1_ICACHE_MISS 0x00
+
+EVENT_PM_INST_CMPL 0x02 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_INST_CMPL 0x00
+
+EVENT_PM_INST_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_INST_FROM_L2_DISP_CONFLICT_LDHITST 0x40
+
+EVENT_PM_INST_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_INST_FROM_L3_DISP_CONFLICT 0x40
+
+EVENT_PM_INST_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_INST_FROM_L31_ECO_SHR 0x40
+
+EVENT_PM_INST_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_INST_FROM_L21_SHR 0x40
+
+EVENT_PM_INST_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_INST_FROM_DL2L3_SHR 0x40
+
+EVENT_PM_INST_FROM_RMEM 0x4A PMC2
+UMASK_PM_INST_FROM_RMEM 0x40
+
+EVENT_PM_INST_FROM_DL4 0x4C PMC2
+UMASK_PM_INST_FROM_DL4 0x40
+
+EVENT_PM_INST_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_INST_SYS_PUMP_CPRED 0x40
+
+EVENT_PM_INST_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_INST_SYS_PUMP_MPRED 0x40
+
+EVENT_PM_L1_ICACHE_RELOADED_PREF 0x68 PMC2
+UMASK_PM_L1_ICACHE_RELOADED_PREF 0x00
+
+EVENT_PM_INST_DISP 0xF2 PMC2
+UMASK_PM_INST_DISP 0x00
+
+EVENT_PM_THRD_CONC_RUN_INST 0xF4 PMC2
+UMASK_PM_THRD_CONC_RUN_INST 0x00
+
+EVENT_PM_INST_FROM_L3MISS 0xFA PMC2
+UMASK_PM_INST_FROM_L3MISS 0x00
+
+EVENT_PM_L1_ICACHE_RELOADED_ALL 0x12 PMC3
+UMASK_PM_L1_ICACHE_RELOADED_ALL 0x00
+
+EVENT_PM_INST_IMC_MATCH_CMPL 0x1C PMC3
+UMASK_PM_INST_IMC_MATCH_CMPL 0x00
+
+EVENT_PM_INST_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_INST_FROM_L2_DISP_CONFLICT_OTHER 0x40
+
+EVENT_PM_INST_FROM_L3 0x42 PMC3
+UMASK_PM_INST_FROM_L3 0x40
+
+EVENT_PM_INST_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_INST_FROM_L31_ECO_MOD 0x40
+
+EVENT_PM_INST_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_INST_FROM_L21_MOD 0x40
+
+EVENT_PM_INST_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_INST_FROM_DL2L3_MOD 0x40
+
+EVENT_PM_INST_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_INST_FROM_OFF_CHIP_CACHE 0x40
+
+EVENT_PM_INST_FROM_DMEM 0x4C PMC3
+UMASK_PM_INST_FROM_DMEM 0x40
+
+EVENT_PM_INST_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_INST_FROM_L3MISS_MOD 0x40
+
+EVENT_PM_INST_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_INST_SYS_PUMP_MPRED_RTY 0x40
+
+EVENT_PM_INST_PUMP_MPRED 0x52 PMC3
+UMASK_PM_INST_PUMP_MPRED 0x40
+
+EVENT_PM_RUN_INST_CMPL 0xFA PMC3
+UMASK_PM_RUN_INST_CMPL 0x00
+
+EVENT_PM_INST_FROM_L1 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_INST_FROM_L1 0x40
+
+EVENT_PM_BANK_CONFLICT 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BANK_CONFLICT 0x48
+
+EVENT_PM_EAT_FULL_CYC 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_EAT_FULL_CYC 0x40
+
+EVENT_PM_IBUF_FULL_CYC 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IBUF_FULL_CYC 0x48
+
+EVENT_PM_IC_DEMAND_REQ 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_DEMAND_REQ 0x40
+
+EVENT_PM_IC_PREF_REQ 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_PREF_REQ 0x48
+
+EVENT_PM_L1_DEMAND_WRITE 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_L1_DEMAND_WRITE 0x40
+
+EVENT_PM_IC_PREF_WRITE 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_PREF_WRITE 0x48
+
+EVENT_PM_IC_PREF_CANCEL_PAGE 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_PREF_CANCEL_PAGE 0x40
+
+EVENT_PM_IC_PREF_CANCEL_HIT 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_PREF_CANCEL_HIT 0x48
+
+EVENT_PM_IC_PREF_CANCEL_L2 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_PREF_CANCEL_L2 0x40
+
+EVENT_PM_IC_RELOAD_PRIVATE 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_RELOAD_PRIVATE 0x48
+
+EVENT_PM_IC_DEMAND_L2_BHT_REDIRECT 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_DEMAND_L2_BHT_REDIRECT 0x40
+
+EVENT_PM_IC_DEMAND_L2_BR_REDIRECT 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_DEMAND_L2_BR_REDIRECT 0x48
+
+EVENT_PM_DECODE_FUSION_LD_ST_DISP 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DECODE_FUSION_LD_ST_DISP 0x48
+
+EVENT_PM_THRD_PRIO_0_1_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_THRD_PRIO_0_1_CYC 0x40
+
+EVENT_PM_THRD_PRIO_2_3_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_THRD_PRIO_2_3_CYC 0x48
+
+EVENT_PM_THRD_PRIO_4_5_CYC 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_THRD_PRIO_4_5_CYC 0x50
+
+EVENT_PM_THRD_PRIO_6_7_CYC 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_THRD_PRIO_6_7_CYC 0x58
+
+EVENT_PM_IC_INVALIDATE 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_INVALIDATE 0x50
+
+EVENT_PM_SHL_CREATED 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_SHL_CREATED 0x50
+
+EVENT_PM_SHL_ST_DEP_CREATED 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_SHL_ST_DEP_CREATED 0x58
+
+EVENT_PM_IC_MISS_ICBI 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_IC_MISS_ICBI 0x50
+
+EVENT_PM_LWSYNC 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LWSYNC 0x58
+
+EVENT_PM_PTESYNC 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_PTESYNC 0x58
+
+EVENT_PM_HWSYNC 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_HWSYNC 0x50
+
+EVENT_PM_FLUSH_LSU 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_LSU 0x58
+
+EVENT_PM_DECODE_HOLD_ICT_FULL 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DECODE_HOLD_ICT_FULL 0x58
+
+#################
+# Branch Events
+#################
+
+EVENT_PM_BRU_FIN 0x68 PMC0
+UMASK_PM_BRU_FIN 0x00
+
+EVENT_PM_BR_2PATH 0x36 PMC1|PMC3
+UMASK_PM_BR_2PATH 0x00
+
+EVENT_PM_TAKEN_BR_MPRED_CMPL 0x56 PMC1
+UMASK_PM_TAKEN_BR_MPRED_CMPL 0x00
+
+EVENT_PM_BACK_BR_CMPL 0x5E PMC1
+UMASK_PM_BACK_BR_CMPL 0x50
+
+EVENT_PM_BR_TAKEN_CMPL 0xFA PMC1
+UMASK_PM_BR_TAKEN_CMPL 0x00
+
+EVENT_PM_BFU_BUSY 0x5C PMC2
+UMASK_PM_BFU_BUSY 0x00
+
+EVENT_PM_BR_CMPL 0x5E PMC3
+UMASK_PM_BR_CMPL 0xD0
+
+EVENT_PM_BR_MPRED_CMPL 0xF6 PMC3
+UMASK_PM_BR_MPRED_CMPL 0x00
+
+EVENT_PM_BR_PRED 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED  0x40
+
+EVENT_PM_BR_CORECT_PRED_TAKEN_CMPL 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_CORECT_PRED_TAKEN_CMPL 0x48
+
+EVENT_PM_BR_UNCOND 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_UNCOND 0x40
+
+EVENT_PM_BR_PRED_PCACHE 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED_PCACHE 0x48
+
+EVENT_PM_BR_PRED_CCACHE 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED_CCACHE 0x40
+
+EVENT_PM_STOP_FETCH_PENDING_CYC 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_STOP_FETCH_PENDING_CYC 0x48
+
+EVENT_PM_BR_PRED_LSTACK 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED_LSTACK 0x40
+
+EVENT_PM_BR_MPRED_CCACHE 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_MPRED_CCACHE 0x40
+
+EVENT_PM_BR_MPRED_LSTACK 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_MPRED_LSTACK 0x48
+
+EVENT_PM_BR_PRED_TAKEN_CR 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED_TAKEN_CR 0x40
+
+EVENT_PM_BR_MPRED_PCACHE 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_MPRED_PCACHE 0x48
+
+EVENT_PM_BR_PRED_TA 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_PRED_TA 0x40
+
+EVENT_PM_DECODE_FUSION_CONST_GEN 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DECODE_FUSION_CONST_GEN 0x48
+
+EVENT_PM_BR_MPRED_TAKEN_CR 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_MPRED_TAKEN_CR 0x40
+
+EVENT_PM_BR_MPRED_TAKEN_TA 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BR_MPRED_TAKEN_TA 0x48
+
+EVENT_PM_LINK_STACK_WRONG_ADD_PRED 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LINK_STACK_WRONG_ADD_PRED 0x50
+
+EVENT_PM_LINK_STACK_INVALID_PTR 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LINK_STACK_INVALID_PTR 0x58
+
+EVENT_PM_LINK_STACK_CORRECT 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LINK_STACK_CORRECT 0x58
+
+EVENT_PM_FLUSH_MPRED 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_MPRED 0x50
+
+EVENT_PM_EAT_FORCE_MISPRED 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_EAT_FORCE_MISPRED 0x50
+
+EVENT_PM_BTAC_BAD_RESULT 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BTAC_BAD_RESULT 0x50
+
+EVENT_PM_BTAC_GOOD_RESULT 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_BTAC_GOOD_RESULT 0x58
+
+EVENT_PM_TAGE_CORRECT_TAKEN_CMPL 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_TAGE_CORRECT_TAKEN_CMPL 0x50
+
+EVENT_PM_TAGE_CORRECT 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_TAGE_CORRECT 0x58
+
+EVENT_PM_TAGE_OVERRIDE_WRONG 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_TAGE_OVERRIDE_WRONG 0x50
+
+EVENT_PM_TAGE_OVERRIDE_WRONG_SPEC 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_TAGE_OVERRIDE_WRONG_SPEC 0x58
+
+
+#################
+# ISU Events
+#################
+
+
+EVENT_PM_DISP_HELD 0x06 PMC0
+UMASK_PM_DISP_HELD 0x00
+
+EVENT_PM_STALL_END_ICT_EMPTY 0x28 PMC0
+UMASK_PM_STALL_END_ICT_EMPTY 0x00
+
+EVENT_PM_1PLUS_PPC_CMPL 0xF2 PMC0
+UMASK_PM_1PLUS_PPC_CMPL 0x00
+
+EVENT_PM_DISP_HELD_ISSQ_FULL 0x06 PMC1
+UMASK_PM_DISP_HELD_ISSQ_FULL 0x00
+
+EVENT_PM_ICT_EMPTY_CYC 0x08 PMC1
+UMASK_PM_ICT_EMPTY_CYC 0x00
+
+EVENT_PM_NTC_ALL_FIN 0x1A PMC1
+UMASK_PM_NTC_ALL_FIN 0x00
+
+EVENT_PM_NTC_ISSUE_HELD_ARB 0x16 PMC1
+UMASK_PM_NTC_ISSUE_HELD_ARB 0xE0
+
+EVENT_PM_IOPS_CMPL 0x50 PMC1
+UMASK_PM_IOPS_CMPL 0x40
+
+EVENT_PM_NTC_FIN 0x5A PMC1
+UMASK_PM_NTC_FIN 0x40
+
+EVENT_PM_DISP_STARVED 0x08 PMC2
+UMASK_PM_DISP_STARVED 0x00
+
+EVENT_PM_FLUSH_COMPLETION 0x12 PMC2
+UMASK_PM_FLUSH_COMPLETION 0x00
+
+EVENT_PM_ISQ_0_8_ENTRIES 0x5A PMC2
+UMASK_PM_ISQ_0_8_ENTRIES 0x00
+
+EVENT_PM_NTC_ISSUE_HELD_OTHER 0x5A PMC2
+UMASK_PM_NTC_ISSUE_HELD_OTHER 0xD0
+
+EVENT_PM_DISP_HELD_HB_FULL 0x5C PMC2
+UMASK_PM_DISP_HELD_HB_FULL 0xD0
+
+EVENT_PM_ISQ_36_44_ENTRIES 0x0A PMC3
+UMASK_PM_ISQ_36_44_ENTRIES 0x00
+
+EVENT_PM_1PLUS_PPC_DISP 0xF2 PMC3
+UMASK_PM_1PLUS_PPC_DISP 0x00
+
+EVENT_PM_FLUSH 0xF8 PMC3
+UMASK_PM_FLUSH 0x00
+
+EVENT_PM_DARQ1_10_12_ENTRIES 0x58 PMC1
+UMASK_PM_DARQ1_10_12_ENTRIES 0x00
+
+EVENT_PM_DARQ1_7_9_ENTRIES 0x5A PMC1
+UMASK_PM_DARQ1_7_9_ENTRIES 0x00
+
+EVENT_PM_DARQ1_4_6_ENTRIES 0x50 PMC2
+UMASK_PM_DARQ1_4_6_ENTRIES 0xE0
+
+EVENT_PM_DARQ1_0_3_ENTRIES 0x22 PMC3
+UMASK_PM_DARQ1_0_3_ENTRIES 0xC1
+
+EVENT_PM_EE_OFF_EXT_INT 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_EE_OFF_EXT_INT 0x20
+
+EVENT_PM_FLUSH_DISP 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_DISP 0x28
+
+EVENT_PM_FLUSH_HB_RESTORE_CYC 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_HB_RESTORE_CYC 0x20
+
+EVENT_PM_ISYNC 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISYNC 0x28
+
+EVENT_PM_FLUSH_DISP_SB 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_DISP_SB 0x20
+
+EVENT_PM_FLUSH_DISP_TLBIE 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_FLUSH_DISP_TLBIE 0x28
+
+EVENT_PM_CLB_HELD 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_CLB_HELD 0x20
+
+EVENT_PM_DISP_CLB_HELD_BAL 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DISP_CLB_HELD_BAL 0x28
+
+EVENT_PM_DISP_CLB_HELD_SB 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DISP_CLB_HELD_SB 0x20
+
+EVENT_PM_DISP_CLB_HELD_TLBIE 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DISP_CLB_HELD_TLBIE 0x28
+
+EVENT_PM_LSU_FLUSH_NEXT 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_NEXT 0x20
+
+EVENT_PM_DISP_HELD_TBEGIN 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DISP_HELD_TBEGIN 0x28
+
+EVENT_PM_ISU0_ISS_HOLD_ALL 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISU0_ISS_HOLD_ALL 0x30
+
+EVENT_PM_ISU2_ISS_HOLD_ALL 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISU2_ISS_HOLD_ALL 0x38
+
+EVENT_PM_ISU1_ISS_HOLD_ALL 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISU1_ISS_HOLD_ALL 0x30
+
+EVENT_PM_ISU3_ISS_HOLD_ALL 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISU3_ISS_HOLD_ALL 0x38
+
+
+
+#################
+# VSU Events
+#################
+
+
+EVENT_PM_VECTOR_LD_CMPL 0x54 PMC3
+UMASK_PM_VECTOR_LD_CMPL 0x40
+
+EVENT_PM_VECTOR_ST_CMPL 0x56 PMC3
+UMASK_PM_VECTOR_ST_CMPL 0x40
+
+EVENT_PM_FXU_BUSY 0x0E PMC1
+UMASK_PM_FXU_BUSY 0x00
+
+EVENT_PM_FXU_IDLE 0x52 PMC1
+UMASK_PM_FXU_IDLE 0x40
+
+EVENT_PM_VSU_FIN 0x5C PMC1
+UMASK_PM_VSU_FIN 0x50
+
+EVENT_PM_FXU_1PLUS_BUSY 0x0E PMC2
+UMASK_PM_FXU_1PLUS_BUSY 0x00
+
+EVENT_PM_VSU_DP_FSQRT_FDIV 0x58 PMC2
+UMASK_PM_VSU_DP_FSQRT_FDIV 0xD0
+
+EVENT_PM_FXU_FIN 0x04 PMC3
+UMASK_PM_FXU_FIN 0x00
+
+EVENT_PM_DFU_BUSY 0x4C PMC3
+UMASK_PM_DFU_BUSY 0xD0
+
+EVENT_PM_VSU_FSQRT_FDIV 0x4E PMC3
+UMASK_PM_VSU_FSQRT_FDIV 0xD0
+
+EVENT_PM_VSU_NON_FLOP_CMPL 0x50 PMC3
+UMASK_PM_VSU_NON_FLOP_CMPL 0xD0
+
+EVENT_PM_1FLOP_CMPL 0x50 PMC3
+UMASK_PM_1FLOP_CMPL 0x50
+
+EVENT_PM_2FLOP_CMPL 0x52 PMC3
+UMASK_PM_2FLOP_CMPL 0xD0
+
+EVENT_PM_4FLOP_CMPL 0x52 PMC3
+UMASK_PM_4FLOP_CMPL 0x50
+
+EVENT_PM_8FLOP_CMPL 0x54 PMC3
+UMASK_PM_8FLOP_CMPL 0xD0
+
+EVENT_PM_NON_FMA_FLOP_CMPL 0x56 PMC3
+UMASK_PM_NON_FMA_FLOP_CMPL 0xD0
+
+EVENT_PM_FMA_CMPL 0x54 PMC3
+UMASK_PM_FMA_CMPL 0x50
+
+EVENT_PM_SCALAR_FLOP_CMPL 0x56 PMC3
+UMASK_PM_SCALAR_FLOP_CMPL 0x50
+
+EVENT_PM_VECTOR_FLOP_CMPL 0x58 PMC3
+UMASK_PM_VECTOR_FLOP_CMPL 0xD0
+
+EVENT_PM_NON_MATH_FLOP_CMPL 0x5A PMC3
+UMASK_PM_NON_MATH_FLOP_CMPL 0xD0
+
+EVENT_PM_DP_QP_FLOP_CMPL 0x5C PMC3
+UMASK_PM_DP_QP_FLOP_CMPL 0xD0
+
+EVENT_PM_IC_MISS_CMPL 0x58 PMC3
+UMASK_PM_IC_MISS_CMPL 0x50
+
+EVENT_PM_SP_FLOP_CMPL 0x5A PMC3
+UMASK_PM_SP_FLOP_CMPL 0x50
+
+EVENT_PM_MATH_FLOP_CMPL 0x5C PMC3
+UMASK_PM_MATH_FLOP_CMPL 0x50
+
+EVENT_PM_FLOP_CMPL 0x5E PMC3
+UMASK_PM_FLOP_CMPL 0x50
+
+
+#################
+# LSU Events
+#################
+
+
+EVENT_PM_LSU_SRQ_FULL_CYC 0x1A PMC0
+UMASK_PM_LSU_SRQ_FULL_CYC 0x00
+
+EVENT_PM_L1_DCACHE_RELOADED_ALL 0x2C PMC0
+UMASK_PM_L1_DCACHE_RELOADED_ALL 0x00
+
+EVENT_PM_LMQ_MERGE 0x2E PMC0
+UMASK_PM_LMQ_MERGE 0x00
+
+EVENT_PM_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_CHIP_PUMP_CPRED 0x00
+
+EVENT_PM_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_GRP_PUMP_MPRED_RTY 0x00
+
+EVENT_PM_PUMP_CPRED 0x54 PMC0
+UMASK_PM_PUMP_CPRED 0x00
+
+EVENT_PM_MEM_READ 0x56 PMC0
+UMASK_PM_MEM_READ 0x00
+
+EVENT_PM_MEM_LOC_THRESH_IFU 0x58 PMC0
+UMASK_PM_MEM_LOC_THRESH_IFU 0x00
+
+EVENT_PM_MEM_LOC_THRESH_LSU_MED 0x5E PMC0
+UMASK_PM_MEM_LOC_THRESH_LSU_MED 0xC0
+
+EVENT_PM_DARQ0_10_12_ENTRIES 0x58 PMC0
+UMASK_PM_DARQ0_10_12_ENTRIES 0xD0
+
+EVENT_PM_STCX_FAIL 0x58 PMC0
+UMASK_PM_STCX_FAIL 0xE0
+
+EVENT_PM_LD_L3MISS_PEND_CYC 0x62 PMC0
+UMASK_PM_LD_L3MISS_PEND_CYC 0x00
+
+EVENT_PM_NTC_ISSUE_HELD_DARQ_FULL 0x6A PMC0
+UMASK_PM_NTC_ISSUE_HELD_DARQ_FULL 0x00
+
+EVENT_PM_RUN_CYC_ST_MODE 0x6C PMC0
+UMASK_PM_RUN_CYC_ST_MODE 0x00
+
+EVENT_PM_LD_REF_L1 0xFC PMC0
+UMASK_PM_LD_REF_L1 0x00
+
+EVENT_PM_ST_FIN 0x16 PMC1
+UMASK_PM_ST_FIN 0x00
+
+EVENT_PM_ST_FWD 0x18 PMC1
+UMASK_PM_ST_FWD 0x00
+
+EVENT_PM_STCX_FIN 0x14 PMC1
+UMASK_PM_STCX_FIN 0xE0
+
+EVENT_PM_LSU_LMQ_SRQ_EMPTY_CYC 0x3E PMC1
+UMASK_PM_LSU_LMQ_SRQ_EMPTY_CYC 0x00
+
+EVENT_PM_LD_MISS_L1_FIN 0x4E PMC1
+UMASK_PM_LD_MISS_L1_FIN 0xC0
+
+EVENT_PM_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_GRP_PUMP_CPRED 0x00
+
+EVENT_PM_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_GRP_PUMP_MPRED 0x00
+
+EVENT_PM_MEM_PREF 0x58 PMC1
+UMASK_PM_MEM_PREF 0xC0
+
+EVENT_PM_DARQ0_7_9_ENTRIES 0x50 PMC1
+UMASK_PM_DARQ0_7_9_ENTRIES 0xE0
+
+EVENT_PM_LRQ_REJECT 0x5A PMC1
+UMASK_PM_LRQ_REJECT 0xE0
+
+EVENT_PM_LSU_REJECT_ERAT_MISS 0x5C PMC1
+UMASK_PM_LSU_REJECT_ERAT_MISS 0xE0
+
+EVENT_PM_LMQ_EMPTY_CYC 0x5E PMC1
+UMASK_PM_LMQ_EMPTY_CYC 0xE0
+
+EVENT_PM_ST_CMPL 0xF0 PMC1
+UMASK_PM_ST_CMPL 0x00
+
+EVENT_PM_LSU_REJECT_LMQ_FULL 0x1C PMC2
+UMASK_PM_LSU_REJECT_LMQ_FULL 0x00
+
+EVENT_PM_DARQ0_4_6_ENTRIES 0x4E PMC2
+UMASK_PM_DARQ0_4_6_ENTRIES 0x50
+
+EVENT_PM_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_SYS_PUMP_CPRED 0x00
+
+EVENT_PM_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_SYS_PUMP_MPRED 0x00
+
+EVENT_PM_TLBIE_FIN 0x58 PMC2
+UMASK_PM_TLBIE_FIN 0x00
+
+EVENT_PM_LARX_FIN 0x58 PMC2
+UMASK_PM_LARX_FIN 0xC0
+
+EVENT_PM_MEM_RWITM 0x5E PMC2
+UMASK_PM_MEM_RWITM 0xC0
+
+EVENT_PM_PARTIAL_ST_FIN 0x54 PMC2
+UMASK_PM_PARTIAL_ST_FIN 0x40
+
+EVENT_PM_LD_MISS_L1 0x54 PMC2
+UMASK_PM_LD_MISS_L1 0xE0
+
+EVENT_PM_DARQ_STORE_XMIT 0x64 PMC2
+UMASK_PM_DARQ_STORE_XMIT 0x00
+
+EVENT_PM_LSU_FIN 0x66 PMC2
+UMASK_PM_LSU_FIN 0x00
+
+EVENT_PM_ST_MISS_L1 0xF0 PMC2
+UMASK_PM_ST_MISS_L1 0x00
+
+EVENT_PM_L1_DCACHE_RELOAD_VALID 0xF6 PMC2
+UMASK_PM_L1_DCACHE_RELOAD_VALID 0x00
+
+EVENT_PM_DTLB_MISS 0xFC PMC2
+UMASK_PM_DTLB_MISS 0x00
+
+EVENT_PM_SRQ_EMPTY_CYC 0x08 PMC3
+UMASK_PM_SRQ_EMPTY_CYC 0x00
+
+EVENT_PM_LD_CMPL 0x3E PMC3
+UMASK_PM_LD_CMPL 0x00
+
+EVENT_PM_DARQ0_0_3_ENTRIES 0x4A PMC3
+UMASK_PM_DARQ0_0_3_ENTRIES 0xD0
+
+EVENT_PM_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_SYS_PUMP_MPRED_RTY 0x00
+
+EVENT_PM_PUMP_MPRED 0x52 PMC3
+UMASK_PM_PUMP_MPRED 0x00
+
+EVENT_PM_MEM_LOC_THRESH_LSU_HIGH 0x56 PMC3
+UMASK_PM_MEM_LOC_THRESH_LSU_HIGH 0x00
+
+EVENT_PM_MEM_CO 0x58 PMC3
+UMASK_PM_MEM_CO 0xC0
+
+EVENT_PM_DARQ_STORE_REJECT 0x5E PMC3
+UMASK_PM_DARQ_STORE_REJECT 0x40
+
+EVENT_PM_LSU_REJECT_LHS 0x5C PMC3
+UMASK_PM_LSU_REJECT_LHS 0xE0
+
+EVENT_PM_LD_MISS_L1 0xF0 PMC3
+UMASK_PM_LD_MISS_L1 0x00
+
+EVENT_PM_LS0_LD_VECTOR_FIN 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_LD_VECTOR_FIN 0xC0
+
+EVENT_PM_LS1_LD_VECTOR_FIN 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_LD_VECTOR_FIN 0xC8
+
+EVENT_PM_LS2_LD_VECTOR_FIN 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS2_LD_VECTOR_FIN 0xC0
+
+EVENT_PM_LS3_LD_VECTOR_FIN 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS3_LD_VECTOR_FIN 0xC8
+
+EVENT_PM_LSU_DTLB_MISS_4K 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_DTLB_MISS_4K 0xC0
+
+EVENT_PM_LSU_DTLB_MISS_64K 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_DTLB_MISS_64K 0xC8
+
+EVENT_PM_LSU_DTLB_MISS_16M_2M 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_DTLB_MISS_16M_2M 0xC0
+
+EVENT_PM_LSU_DTLB_MISS_16G_1G 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_DTLB_MISS_16G_1G 0xC8
+
+EVENT_PM_LSU_STCX 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_STCX 0xC0
+
+EVENT_PM_LSU_NCST 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_NCST 0xC8
+
+EVENT_PM_LS0_UNALIGNED_LD 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_UNALIGNED_LD 0xC0
+
+EVENT_PM_LS1_UNALIGNED_LD 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_UNALIGNED_LD 0xC8
+
+EVENT_PM_LS2_UNALIGNED_LD 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS2_UNALIGNED_LD 0xC0
+
+EVENT_PM_LS3_UNALIGNED_LD 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS3_UNALIGNED_LD 0xC8
+
+EVENT_PM_LS0_UNALIGNED_ST 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_UNALIGNED_ST 0xF0
+
+EVENT_PM_LS1_UNALIGNED_ST 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_UNALIGNED_ST 0xF8
+
+EVENT_PM_LS2_UNALIGNED_ST 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS2_UNALIGNED_ST 0xF0
+
+EVENT_PM_LS3_UNALIGNED_ST 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS3_UNALIGNED_ST 0xF8
+
+EVENT_PM_LS0_LAUNCH_HELD_PREF 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_LAUNCH_HELD_PREF 0xC0
+
+EVENT_PM_LS1_LAUNCH_HELD_PREF 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_LAUNCH_HELD_PREF 0xC8
+
+EVENT_PM_LSU0_FALSE_LHS 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_FALSE_LHS 0xC0
+
+EVENT_PM_LSU1_FALSE_LHS 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_FALSE_LHS 0xC8
+
+EVENT_PM_LSU2_FALSE_LHS 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_FALSE_LHS 0xC0
+
+EVENT_PM_LSU3_FALSE_LHS 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_FALSE_LHS 0xC8
+
+EVENT_PM_LSU_FLUSH_CI 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_CI 0xC0
+
+EVENT_PM_LSU_FLUSH_ATOMIC 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_ATOMIC 0xC8
+
+EVENT_PM_LSU_FLUSH_EMSH 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_EMSH 0xC0
+
+EVENT_PM_LSU_FLUSH_RELAUNCH_MISS 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_RELAUNCH_MISS 0xC8
+
+EVENT_PM_LSU_FLUSH_UE 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_UE 0xC0
+
+EVENT_PM_LSU_FLUSH_LHS 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LHS 0xC8
+
+EVENT_PM_LSU_FLUSH_WRK_ARND 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_WRK_ARND 0xC0
+
+EVENT_PM_LSU_FLUSH_LHL_SHL 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LHL_SHL 0xC8
+
+EVENT_PM_LSU_FLUSH_SAO 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_SAO 0xC0
+
+EVENT_PM_LSU_FLUSH_LARX_STCX 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LARX_STCX 0xC8
+
+EVENT_PM_LSU_FLUSH_OTHER 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_OTHER 0xC0
+
+EVENT_PM_STCX_SUCCESS_CMPL 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_STCX_SUCCESS_CMPL 0xC8
+
+EVENT_PM_LSU0_SET_MPRED 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_SET_MPRED 0xD0
+
+EVENT_PM_LSU1_SET_MPRED 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_SET_MPRED 0xD8
+
+EVENT_PM_LSU2_SET_MPRED 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_SET_MPRED 0xD0
+
+EVENT_PM_LSU3_SET_MPRED 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_SET_MPRED 0xD8
+
+EVENT_PM_LSU0_LDMX_FIN 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_LDMX_FIN 0xD0
+
+EVENT_PM_LSU1_LDMX_FIN 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_LDMX_FIN 0xD8
+
+EVENT_PM_LSU2_LDMX_FIN 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_LDMX_FIN 0xD0
+
+EVENT_PM_LSU3_LDMX_FIN 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_LDMX_FIN 0xD8
+
+EVENT_PM_LS0_DC_COLLISIONS 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_DC_COLLISIONS 0xD0
+
+EVENT_PM_LS1_DC_COLLISIONS 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_DC_COLLISIONS 0xD8
+
+EVENT_PM_LS2_DC_COLLISIONS 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS2_DC_COLLISIONS 0xD0
+
+EVENT_PM_LS3_DC_COLLISIONS 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS3_DC_COLLISIONS 0xD8
+
+EVENT_PM_LSU_FLUSH_ATOMIC 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_ATOMIC 0xD1
+
+EVENT_PM_LSU_FLUSH_EMSH 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_EMSH 0xD9
+
+EVENT_PM_LSU_FLUSH_RELAUNCH_MISS 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_RELAUNCH_MISS 0xD1
+
+EVENT_PM_LSU_FLUSH_UE 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_UE 0xD9
+
+EVENT_PM_LSU_FLUSH_LHS 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LHS 0xD1
+
+EVENT_PM_LSU_FLUSH_LHL_SHL 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LHL_SHL 0xD9
+
+EVENT_PM_LSU_FLUSH_SAO 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_SAO 0xD1
+
+EVENT_PM_LSU_FLUSH_LARX_STCX 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_FLUSH_LARX_STCX 0xD9
+
+EVENT_PM_SRQ_SYNC_CYC 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_SRQ_SYNC_CYC 0xD0
+
+EVENT_PM_LSU0_SRQ_S0_VALID_CYC 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_SRQ_S0_VALID_CYC 0xD0
+
+EVENT_PM_LSU0_LRQ_S0_VALID_CYC 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_LRQ_S0_VALID_CYC 0xD8
+
+EVENT_PM_LSU_LMQ_FULL_CYC 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_LMQ_FULL_CYC 0xD0
+
+EVENT_PM_LSU0_LMQ_S0_VALID 0xB8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_LMQ_S0_VALID 0xD8
+
+EVENT_PM_LSU0_1_LRQF_FULL_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_1_LRQF_FULL_CYC 0xD0
+
+EVENT_PM_LSU2_3_LRQF_FULL_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_3_LRQF_FULL_CYC 0xD8
+
+EVENT_PM_S2Q_FULL 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_S2Q_FULL 0xE0
+
+EVENT_PM_L1_SW_PREF 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_L1_SW_PREF 0xE8
+
+EVENT_PM_LS0_ERAT_MISS_PREF 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_ERAT_MISS_PREF 0xE0
+
+EVENT_PM_LS1_ERAT_MISS_PREF 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_ERAT_MISS_PREF 0xE8
+
+EVENT_PM_LS2_ERAT_MISS_PREF 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS2_ERAT_MISS_PREF 0xE0
+
+EVENT_PM_LS3_ERAT_MISS_PREF 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS3_ERAT_MISS_PREF 0xE8
+
+EVENT_PM_LSU0_ERAT_HIT 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_ERAT_HIT 0xE0
+
+EVENT_PM_LSU1_ERAT_HIT 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_ERAT_HIT 0xE8
+
+EVENT_PM_LSU2_ERAT_HIT 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_ERAT_HIT 0xE0
+
+EVENT_PM_LSU3_ERAT_HIT 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_ERAT_HIT 0xE8
+
+EVENT_PM_LSU0_TM_L1_MISS 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_TM_L1_MISS 0xE0
+
+EVENT_PM_LSU1_TM_L1_MISS 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_TM_L1_MISS 0xE8
+
+EVENT_PM_LSU2_TM_L1_MISS 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_TM_L1_MISS 0xE0
+
+EVENT_PM_LSU3_TM_L1_MISS 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_TM_L1_MISS 0xE8
+
+EVENT_PM_LSU_STCX_FAIL 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU_STCX_FAIL 0xF0
+
+EVENT_PM_SNOOP_TLBIE 0x80 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_SNOOP_TLBIE 0xF8
+
+EVENT_PM_PTE_PREFETCH 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_PTE_PREFETCH 0xF0
+
+EVENT_PM_LSU0_STORE_REJECT 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_STORE_REJECT 0xF0
+
+EVENT_PM_LSU1_STORE_REJECT 0x88 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_STORE_REJECT 0xF8
+
+EVENT_PM_LSU2_STORE_REJECT 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_STORE_REJECT 0xF0
+
+EVENT_PM_LSU3_STORE_REJECT 0x8C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_STORE_REJECT 0xF8
+
+EVENT_PM_LSU0_L1_CAM_CANCEL 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU0_L1_CAM_CANCEL 0xF0
+
+EVENT_PM_LSU1_L1_CAM_CANCEL 0x90 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU1_L1_CAM_CANCEL 0xF8
+
+EVENT_PM_LSU2_L1_CAM_CANCEL 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU2_L1_CAM_CANCEL 0xF0
+
+EVENT_PM_LSU3_L1_CAM_CANCEL 0x94 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LSU3_L1_CAM_CANCEL 0xF8
+
+EVENT_PM_DATA_STORE 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DATA_STORE 0xF0
+
+EVENT_PM_NON_DATA_STORE 0xA0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_NON_DATA_STORE 0xF8
+
+EVENT_PM_DC_PREF_HW_ALLOC 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_HW_ALLOC 0xF0
+
+EVENT_PM_DC_PREF_SW_ALLOC 0xA4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_SW_ALLOC 0xF8
+
+EVENT_PM_DC_PREF_CONF 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_CONF 0xF0
+
+EVENT_PM_DC_PREF_FUZZY_CONF 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_FUZZY_CONF 0xF9
+
+EVENT_PM_DC_PREF_STRIDED_CONF 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_STRIDED_CONF 0xF0
+
+EVENT_PM_DC_DEALLOC_NO_CONF 0xAC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_DEALLOC_NO_CONF 0xF8
+
+EVENT_PM_L3_LD_PREF 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_L3_LD_PREF 0xF0
+
+EVENT_PM_L3_SW_PREF 0xB0 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_L3_SW_PREF 0xF8
+
+EVENT_PM_DC_PREF_CONS_ALLOC 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_CONS_ALLOC 0xF0
+
+EVENT_PM_DC_PREF_XCONS_ALLOC 0xB4 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DC_PREF_XCONS_ALLOC 0xF8
+
+EVENT_PM_L1_PREF 0x54 PMC1
+UMASK_PM_L1_PREF 0x00
+
+
+#################
+# Data Source Events
+#################
+
+EVENT_PM_DATA_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_DATA_FROM_L2_NO_CONFLICT 0xC0
+
+EVENT_PM_DATA_FROM_L2 0x42 PMC0
+UMASK_PM_DATA_FROM_L2 0xC0
+
+EVENT_PM_DATA_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_DATA_FROM_L3_NO_CONFLICT 0xC0
+
+EVENT_PM_DATA_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_DATA_FROM_L31_SHR 0xC0
+
+EVENT_PM_DATA_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_DATA_FROM_ON_CHIP_CACHE 0xC0
+
+EVENT_PM_DATA_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_DATA_FROM_RL2L3_SHR 0xC0
+
+EVENT_PM_DATA_FROM_LL4 0x4C PMC0
+UMASK_PM_DATA_FROM_LL4 0xC0
+
+EVENT_PM_DATA_FROM_L2MISS_MOD 0x4E PMC0
+UMASK_PM_DATA_FROM_L2MISS_MOD 0xC0
+
+EVENT_PM_DATA_CHIP_PUMP_CPRED 0x50 PMC0
+UMASK_PM_DATA_CHIP_PUMP_CPRED 0xC0
+
+EVENT_PM_DATA_GRP_PUMP_MPRED_RTY 0x52 PMC0
+UMASK_PM_DATA_GRP_PUMP_MPRED_RTY 0xC0
+
+EVENT_PM_DATA_PUMP_CPRED 0x54 PMC0
+UMASK_PM_DATA_PUMP_CPRED 0xC0
+
+EVENT_PM_DATA_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_DATA_FROM_L2_MEPF 0xC0
+
+EVENT_PM_DATA_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_DATA_FROM_L3_MEPF 0xC0
+
+EVENT_PM_DATA_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_DATA_FROM_L31_MOD 0xC0
+
+EVENT_PM_DATA_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_DATA_FROM_RL2L3_MOD 0xC0
+
+EVENT_PM_DATA_FROM_LMEM 0x48 PMC1
+UMASK_PM_DATA_FROM_LMEM 0xC0
+
+EVENT_PM_DATA_FROM_RL4 0x4A PMC1
+UMASK_PM_DATA_FROM_RL4 0xC0
+
+EVENT_PM_DATA_GRP_PUMP_CPRED 0x50 PMC1
+UMASK_PM_DATA_GRP_PUMP_CPRED 0xC0
+
+EVENT_PM_DATA_GRP_PUMP_MPRED 0x52 PMC1
+UMASK_PM_DATA_GRP_PUMP_MPRED 0xC0
+
+EVENT_PM_DATA_FROM_L2MISS 0xFE PMC1
+UMASK_PM_DATA_FROM_L2MISS 0x00
+
+EVENT_PM_DATA_TABLEWALK_CYC 0x1A PMC2
+UMASK_PM_DATA_TABLEWALK_CYC 0x00
+
+EVENT_PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0x40 PMC2
+UMASK_PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST 0xC0
+
+EVENT_PM_DATA_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_DATA_FROM_L3_DISP_CONFLICT 0xC0
+
+EVENT_PM_DATA_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_DATA_FROM_L31_ECO_SHR 0xC0
+
+EVENT_PM_DATA_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_DATA_FROM_L21_SHR 0xC0
+
+EVENT_PM_DATA_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_DATA_FROM_DL2L3_SHR 0xC0
+
+EVENT_PM_DATA_FROM_RMEM 0x4A PMC2
+UMASK_PM_DATA_FROM_RMEM 0xC0
+
+EVENT_PM_DATA_FROM_DL4 0x4C PMC2
+UMASK_PM_DATA_FROM_DL4 0xC0
+
+EVENT_PM_DATA_SYS_PUMP_CPRED 0x50 PMC2
+UMASK_PM_DATA_SYS_PUMP_CPRED 0xC0
+
+EVENT_PM_DATA_SYS_PUMP_MPRED 0x52 PMC2
+UMASK_PM_DATA_SYS_PUMP_MPRED 0xC0
+
+EVENT_PM_DATA_FROM_L3MISS 0xFE PMC2
+UMASK_PM_DATA_FROM_L3MISS 0x00
+
+EVENT_PM_DATA_FROM_L2_DISP_CONFLICT_OTHER 0x40 PMC3
+UMASK_PM_DATA_FROM_L2_DISP_CONFLICT_OTHER 0xC0
+
+EVENT_PM_DATA_FROM_L3 0x42 PMC3
+UMASK_PM_DATA_FROM_L3 0xC0
+
+EVENT_PM_DATA_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_DATA_FROM_L31_ECO_MOD 0xC0
+
+EVENT_PM_DATA_FROM_L21_ECO_MOD 0x46 PMC3
+UMASK_PM_DATA_FROM_L21_ECO_MOD 0xC0
+
+EVENT_PM_DATA_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_DATA_FROM_DL2L3_MOD 0xC0
+
+EVENT_PM_DATA_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_DATA_FROM_OFF_CHIP_CACHE 0xC0
+
+EVENT_PM_DATA_FROM_DMEM 0x4C PMC3
+UMASK_PM_DATA_FROM_DMEM 0xC0
+
+EVENT_PM_DATA_FROM_L3MISS_MOD 0x4E PMC3
+UMASK_PM_DATA_FROM_L3MISS_MOD 0xC0
+
+EVENT_PM_DPTEG_FROM_L3MISS 0x4E PMC3
+UMASK_PM_DPTEG_FROM_L3MISS 0xE0
+
+EVENT_PM_DATA_SYS_PUMP_MPRED_RTY 0x50 PMC3
+UMASK_PM_DATA_SYS_PUMP_MPRED_RTY 0xC0
+
+EVENT_PM_DATA_PUMP_MPRED 0x52 PMC3
+UMASK_PM_DATA_PUMP_MPRED 0xC0
+
+EVENT_PM_DATA_FROM_MEMORY 0xFE PMC3
+UMASK_PM_DATA_FROM_MEMORY 0x00
+
+EVENT_PM_L3_CO_MEPF 0x5E PMC2
+UMASK_PM_L3_CO_MEPF 0xE0
+
+
+#################
+# Translation Events
+#################
+
+EVENT_PM_IPTEG_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_IPTEG_FROM_L2_NO_CONFLICT 0x50
+
+EVENT_PM_IPTEG_FROM_L2 0x42 PMC0
+UMASK_PM_IPTEG_FROM_L2 0x50
+
+EVENT_PM_IPTEG_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_IPTEG_FROM_L3_NO_CONFLICT 0x50
+
+EVENT_PM_IPTEG_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_IPTEG_FROM_L31_SHR 0x50
+
+EVENT_PM_IPTEG_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_IPTEG_FROM_ON_CHIP_CACHE 0x50
+
+EVENT_PM_IPTEG_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_IPTEG_FROM_RL2L3_SHR 0x50
+
+EVENT_PM_IPTEG_FROM_LL4 0x4C PMC0
+UMASK_PM_IPTEG_FROM_LL4 0x50
+
+EVENT_PM_IPTEG_FROM_L2MISS 0x4E PMC0
+UMASK_PM_IPTEG_FROM_L2MISS 0x50
+
+EVENT_PM_IERAT_RELOAD 0xF6 PMC0
+UMASK_PM_IERAT_RELOAD 0x00
+
+EVENT_PM_IPTEG_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_IPTEG_FROM_L2_MEPF 0x50
+
+EVENT_PM_IPTEG_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_IPTEG_FROM_L3_MEPF 0x50
+
+EVENT_PM_IPTEG_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_IPTEG_FROM_L31_MOD 0x50
+
+EVENT_PM_IPTEG_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_IPTEG_FROM_RL2L3_MOD 0x50
+
+EVENT_PM_IPTEG_FROM_LMEM 0x48 PMC1
+UMASK_PM_IPTEG_FROM_LMEM 0x50
+
+EVENT_PM_IPTEG_FROM_RL4 0x4A PMC1
+UMASK_PM_IPTEG_FROM_RL4 0x50
+
+EVENT_PM_IPTEG_FROM_MEMORY 0x4C PMC1
+UMASK_PM_IPTEG_FROM_MEMORY 0x50
+
+EVENT_PM_IERAT_RELOAD_4K 0x64 PMC1
+UMASK_PM_IERAT_RELOAD_4K 0x00
+
+EVENT_PM_IPTEG_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_IPTEG_FROM_L3_DISP_CONFLICT 0x50
+
+EVENT_PM_IPTEG_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_IPTEG_FROM_L31_ECO_SHR 0x50
+
+EVENT_PM_IPTEG_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_IPTEG_FROM_L21_SHR 0x50
+
+EVENT_PM_IPTEG_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_IPTEG_FROM_DL2L3_SHR 0x50
+
+EVENT_PM_IPTEG_FROM_RMEM 0x4A PMC2
+UMASK_PM_IPTEG_FROM_RMEM 0x50
+
+EVENT_PM_IPTEG_FROM_DL4 0x4C PMC2
+UMASK_PM_IPTEG_FROM_DL4 0x50
+
+EVENT_PM_IERAT_RELOAD_64K 0x6A PMC2
+UMASK_PM_IERAT_RELOAD_64K 0x00
+
+EVENT_PM_ISLB_MISS 0x06 PMC3
+UMASK_PM_ISLB_MISS 0x00
+
+EVENT_PM_DERAT_MISS_16G_1G 0x54 PMC3
+UMASK_PM_DERAT_MISS_16G_1G 0xC0
+
+EVENT_PM_IPTEG_FROM_L3 0x42 PMC3
+UMASK_PM_IPTEG_FROM_L3 0x50
+
+EVENT_PM_IPTEG_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_IPTEG_FROM_L31_ECO_MOD 0x50
+
+EVENT_PM_IPTEG_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_IPTEG_FROM_L21_MOD 0x50
+
+EVENT_PM_IPTEG_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_IPTEG_FROM_DL2L3_MOD 0x50
+
+EVENT_PM_IPTEG_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_IPTEG_FROM_OFF_CHIP_CACHE 0x50
+
+EVENT_PM_IPTEG_FROM_DMEM 0x4C PMC3
+UMASK_PM_IPTEG_FROM_DMEM 0x50
+
+EVENT_PM_IPTEG_FROM_L3MISS 0x4E PMC3
+UMASK_PM_IPTEG_FROM_L3MISS 0x50
+
+EVENT_PM_IERAT_RELOAD_16M 0x6A PMC3
+UMASK_PM_IERAT_RELOAD_16M 0x00
+
+EVENT_PM_ITLB_MISS 0xFC PMC3
+UMASK_PM_ITLB_MISS 0x00
+
+EVENT_PM_DPTEG_FROM_L2_NO_CONFLICT 0x40 PMC0
+UMASK_PM_DPTEG_FROM_L2_NO_CONFLICT 0xE0
+
+EVENT_PM_DPTEG_FROM_L2 0x42 PMC0
+UMASK_PM_DPTEG_FROM_L2 0xE0
+
+EVENT_PM_DPTEG_FROM_L3_NO_CONFLICT 0x44 PMC0
+UMASK_PM_DPTEG_FROM_L3_NO_CONFLICT 0xE0
+
+EVENT_PM_DPTEG_FROM_L31_SHR 0x46 PMC0
+UMASK_PM_DPTEG_FROM_L31_SHR 0xE0
+
+EVENT_PM_DPTEG_FROM_ON_CHIP_CACHE 0x48 PMC0
+UMASK_PM_DPTEG_FROM_ON_CHIP_CACHE 0xE0
+
+EVENT_PM_DPTEG_FROM_RL2L3_SHR 0x4A PMC0
+UMASK_PM_DPTEG_FROM_RL2L3_SHR 0xE0
+
+EVENT_PM_DPTEG_FROM_LL4 0x4C PMC0
+UMASK_PM_DPTEG_FROM_LL4 0xE0
+
+EVENT_PM_DPTEG_FROM_L2MISS 0x4E PMC0
+UMASK_PM_DPTEG_FROM_L2MISS 0xE0
+
+EVENT_PM_DPTEG_FROM_L2_MEPF 0x40 PMC1
+UMASK_PM_DPTEG_FROM_L2_MEPF 0xE0
+
+EVENT_PM_DPTEG_FROM_L3_MEPF 0x42 PMC1
+UMASK_PM_DPTEG_FROM_L3_MEPF 0xE0
+
+EVENT_PM_DPTEG_FROM_L31_MOD 0x44 PMC1
+UMASK_PM_DPTEG_FROM_L31_MOD 0xE0
+
+EVENT_PM_DPTEG_FROM_RL2L3_MOD 0x46 PMC1
+UMASK_PM_DPTEG_FROM_RL2L3_MOD 0xE0
+
+EVENT_PM_DPTEG_FROM_LMEM 0x48 PMC1
+UMASK_PM_DPTEG_FROM_LMEM 0xE0
+
+EVENT_PM_DPTEG_FROM_RL4 0x4A PMC1
+UMASK_PM_DPTEG_FROM_RL4 0xE0
+
+EVENT_PM_DPTEG_FROM_MEMORY 0x4C PMC1
+UMASK_PM_DPTEG_FROM_MEMORY 0xE0
+
+EVENT_PM_DPTEG_FROM_L3_DISP_CONFLICT 0x42 PMC2
+UMASK_PM_DPTEG_FROM_L3_DISP_CONFLICT 0xE0
+
+EVENT_PM_DPTEG_FROM_L31_ECO_SHR 0x44 PMC2
+UMASK_PM_DPTEG_FROM_L31_ECO_SHR 0xE0
+
+EVENT_PM_DPTEG_FROM_L21_SHR 0x46 PMC2
+UMASK_PM_DPTEG_FROM_L21_SHR 0xE0
+
+EVENT_PM_DPTEG_FROM_DL2L3_SHR 0x48 PMC2
+UMASK_PM_DPTEG_FROM_DL2L3_SHR 0xE0
+
+EVENT_PM_DPTEG_FROM_RMEM 0x4A PMC2
+UMASK_PM_DPTEG_FROM_RMEM 0xE0
+
+EVENT_PM_DPTEG_FROM_DL4 0x4C PMC2
+UMASK_PM_DPTEG_FROM_DL4 0xE0
+
+EVENT_PM_DPTEG_FROM_L3 0x42 PMC3
+UMASK_PM_DPTEG_FROM_L3 0xE0
+
+EVENT_PM_DPTEG_FROM_L31_ECO_MOD 0x44 PMC3
+UMASK_PM_DPTEG_FROM_L31_ECO_MOD 0xE0
+
+EVENT_PM_DPTEG_FROM_L21_MOD 0x46 PMC3
+UMASK_PM_DPTEG_FROM_L21_MOD 0xE0
+
+EVENT_PM_DPTEG_FROM_DL2L3_MOD 0x48 PMC3
+UMASK_PM_DPTEG_FROM_DL2L3_MOD 0xE0
+
+EVENT_PM_DPTEG_FROM_OFF_CHIP_CACHE 0x4A PMC3
+UMASK_PM_DPTEG_FROM_OFF_CHIP_CACHE 0xE0
+
+EVENT_PM_DPTEG_FROM_DMEM 0x4C PMC3
+UMASK_PM_DPTEG_FROM_DMEM 0xE0
+
+EVENT_PM_DSLB_MISS 0x16 PMC0
+UMASK_PM_DSLB_MISS 0x00
+
+EVENT_PM_TABLEWALK_CYC 0x26 PMC0
+UMASK_PM_TABLEWALK_CYC 0x00
+
+EVENT_PM_DERAT_MISS_4K 0x56 PMC0
+UMASK_PM_DERAT_MISS_4K 0xC0
+
+EVENT_PM_TLB_HIT 0x54 PMC0
+UMASK_PM_TLB_HIT 0xF0
+
+EVENT_PM_DERAT_MISS_64K 0x54 PMC1
+UMASK_PM_DERAT_MISS_64K 0xC0
+
+EVENT_PM_DTLB_MISS_4K 0x56 PMC1
+UMASK_PM_DTLB_MISS_4K 0xC0
+
+EVENT_PM_TLB_MISS 0x66 PMC1
+UMASK_PM_TLB_MISS 0x00
+
+EVENT_PM_LSU_DERAT_MISS 0xF6 PMC1
+UMASK_PM_LSU_DERAT_MISS 0x00
+
+EVENT_PM_DERAT_MISS_16M_2M 0x54 PMC2
+UMASK_PM_DERAT_MISS_16M_2M 0xC0
+
+EVENT_PM_DSLB_MISS 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_DSLB_MISS 0xD0
+
+EVENT_PM_ISLB_MISS 0xA8 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_ISLB_MISS 0xD8
+
+EVENT_PM_LS0_PTE_TABLEWALK_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS0_PTE_TABLEWALK_CYC 0xE0
+
+EVENT_PM_LS1_PTE_TABLEWALK_CYC 0xBC PMC0|PMC1|PMC2|PMC3
+UMASK_PM_LS1_PTE_TABLEWALK_CYC 0xE8
+
+EVENT_PM_TABLEWALK_CYC_PREF 0x84 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_TABLEWALK_CYC_PREF 0xF8
+
+EVENT_PM_XLATE_HPT_MODE 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_XLATE_HPT_MODE 0xF0
+
+EVENT_PM_XLATE_RADIX_MODE 0x98 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_XLATE_RADIX_MODE 0xF8
+
+EVENT_PM_XLATE_MISS 0x9C PMC0|PMC1|PMC2|PMC3
+UMASK_PM_XLATE_MISS 0xF8
+
+
+
+#################
+# Radix Events
+#################
+
+EVENT_PM_RADIX_PWC_L2_PTE_FROM_L2 0x58 PMC0
+UMASK_PM_RADIX_PWC_L2_PTE_FROM_L2 0xF0
+
+EVENT_PM_RADIX_PWC_L4_PTE_FROM_L2 0x5A PMC0
+UMASK_PM_RADIX_PWC_L4_PTE_FROM_L2 0xF0
+
+EVENT_PM_RADIX_PWC_L3_PDE_FROM_L3 0x5C PMC0
+UMASK_PM_RADIX_PWC_L3_PDE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_L1_PDE_FROM_L2 0x26 PMC1
+UMASK_PM_RADIX_PWC_L1_PDE_FROM_L2 0xD0
+
+EVENT_PM_RADIX_PWC_L2_PDE_FROM_L2 0x28 PMC1
+UMASK_PM_RADIX_PWC_L2_PDE_FROM_L2 0xD0
+
+EVENT_PM_RADIX_PWC_L3_PDE_FROM_L2 0x2A PMC1
+UMASK_PM_RADIX_PWC_L3_PDE_FROM_L2 0xD0
+
+EVENT_PM_RADIX_PWC_L3_PTE_FROM_L2 0x2E PMC1
+UMASK_PM_RADIX_PWC_L3_PTE_FROM_L2 0xD0
+
+EVENT_PM_RADIX_PWC_L4_PTE_FROM_L3MISS 0x54 PMC2
+UMASK_PM_RADIX_PWC_L4_PTE_FROM_L3MISS 0xF0
+
+EVENT_PM_RADIX_PWC_L1_PDE_FROM_L3 0x58 PMC2
+UMASK_PM_RADIX_PWC_L1_PDE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_L2_PDE_FROM_L3 0x5A PMC2
+UMASK_PM_RADIX_PWC_L2_PDE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_L3_PTE_FROM_L3 0x5E PMC2
+UMASK_PM_RADIX_PWC_L3_PTE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_MISS 0x54 PMC3
+UMASK_PM_RADIX_PWC_MISS 0xF0
+
+EVENT_PM_RADIX_PWC_L1_PDE_FROM_L3MISS 0x56 PMC3
+UMASK_PM_RADIX_PWC_L1_PDE_FROM_L3MISS 0xF0
+
+EVENT_PM_RADIX_PWC_L2_PTE_FROM_L3 0x58 PMC3
+UMASK_PM_RADIX_PWC_L2_PTE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_L4_PTE_FROM_L3 0x5A PMC3
+UMASK_PM_RADIX_PWC_L4_PTE_FROM_L3 0xF0
+
+EVENT_PM_RADIX_PWC_L2_PTE_FROM_L3MISS 0x5C PMC3
+UMASK_PM_RADIX_PWC_L2_PTE_FROM_L3MISS 0xF0
+
+EVENT_PM_RADIX_PWC_L3_PTE_FROM_L3MISS 0x5E PMC3
+UMASK_PM_RADIX_PWC_L3_PTE_FROM_L3MISS 0xF0
+
+
+#################
+# L2 Events
+#################
+
+EVENT_PM_L2_LD 0x80 PMC0
+UMASK_PM_L2_LD 0x60
+
+EVENT_PM_L2_ST 0x80 PMC0
+UMASK_PM_L2_ST 0x68
+
+EVENT_PM_L2_LD_MISS 0x80 PMC1
+UMASK_PM_L2_LD_MISS 0x60
+
+EVENT_PM_L2_ST_MISS 0x80 PMC1
+UMASK_PM_L2_ST_MISS 0x68
+
+EVENT_PM_L2_INST 0x80 PMC2
+UMASK_PM_L2_INST 0x60
+
+EVENT_PM_L2_INST_MISS 0x80 PMC2
+UMASK_PM_L2_INST_MISS 0x68
+
+EVENT_PM_L2_DISP_ALL_L2MISS 0x80 PMC3
+UMASK_PM_L2_DISP_ALL_L2MISS 0x60
+
+EVENT_PM_ISIDE_MRU_TOUCH 0x80 PMC3
+UMASK_PM_ISIDE_MRU_TOUCH 0x68
+
+EVENT_PM_L2_CASTOUT_MOD 0x82 PMC0
+UMASK_PM_L2_CASTOUT_MOD 0x60
+
+EVENT_PM_L2_CASTOUT_SHR 0x82 PMC0
+UMASK_PM_L2_CASTOUT_SHR 0x68
+
+EVENT_PM_L2_IC_INV 0x82 PMC1
+UMASK_PM_L2_IC_INV 0x60
+
+EVENT_PM_L2_DC_INV 0x82 PMC1
+UMASK_PM_L2_DC_INV 0x68
+
+EVENT_PM_L2_LD_DISP 0x82 PMC2
+UMASK_PM_L2_LD_DISP 0x60
+
+EVENT_PM_L2_LD_HIT 0x82 PMC2
+UMASK_PM_L2_LD_HIT 0x68
+
+EVENT_PM_L2_ST_DISP 0x82 PMC3
+UMASK_PM_L2_ST_DISP 0x60
+
+EVENT_PM_L2_ST_HIT 0x82 PMC3
+UMASK_PM_L2_ST_HIT 0x68
+
+EVENT_PM_L2_RCLD_DISP 0x84 PMC0
+UMASK_PM_L2_RCLD_DISP 0x60
+
+EVENT_PM_L2_RCLD_DISP_FAIL_ADDR 0x84 PMC0
+UMASK_PM_L2_RCLD_DISP_FAIL_ADDR 0x68
+
+EVENT_PM_L2_RCLD_DISP_FAIL_OTHER 0x84 PMC1
+UMASK_PM_L2_RCLD_DISP_FAIL_OTHER 0x60
+
+EVENT_PM_DSIDE_MRU_TOUCH 0x84 PMC1
+UMASK_PM_DSIDE_MRU_TOUCH 0x68
+
+EVENT_PM_L2_RCST_DISP 0x84 PMC2
+UMASK_PM_L2_RCST_DISP 0x60
+
+EVENT_PM_L2_RCST_DISP_FAIL_ADDR 0x84 PMC2
+UMASK_PM_L2_RCST_DISP_FAIL_ADDR 0x68
+
+EVENT_PM_L2_RCST_DISP_FAIL_OTHER 0x84 PMC3
+UMASK_PM_L2_RCST_DISP_FAIL_OTHER 0x60
+
+EVENT_PM_L2_SN_M_WR_DONE 0x86 PMC0
+UMASK_PM_L2_SN_M_WR_DONE 0x60
+
+EVENT_PM_CO_DISP_FAIL 0x86 PMC0
+UMASK_PM_CO_DISP_FAIL 0x68
+
+EVENT_PM_CO_TM_SC_FOOTPRINT 0x86 PMC1
+UMASK_PM_CO_TM_SC_FOOTPRINT 0x60
+
+EVENT_PM_L2_RC_ST_DONE 0x86 PMC2
+UMASK_PM_L2_RC_ST_DONE 0x60
+
+EVENT_PM_L2_SN_SX_I_DONE 0x86 PMC2
+UMASK_PM_L2_SN_SX_I_DONE 0x68
+
+EVENT_PM_L2_SN_M_RD_DONE 0x86 PMC3
+UMASK_PM_L2_SN_M_RD_DONE 0x60
+
+EVENT_PM_L2_SN_M_WR_DONE 0x86 PMC3
+UMASK_PM_L2_SN_M_WR_DONE 0x68
+
+EVENT_PM_L2_LOC_GUESS_CORRECT 0x88 PMC0
+UMASK_PM_L2_LOC_GUESS_CORRECT 0x60
+
+EVENT_PM_L2_LOC_GUESS_WRONG 0x88 PMC0
+UMASK_PM_L2_LOC_GUESS_WRONG 0x68
+
+EVENT_PM_L2_GRP_GUESS_CORRECT 0x88 PMC1
+UMASK_PM_L2_GRP_GUESS_CORRECT 0x60
+
+EVENT_PM_L2_GRP_GUESS_WRONG 0x88 PMC1
+UMASK_PM_L2_GRP_GUESS_WRONG 0x68
+
+EVENT_PM_L2_SYS_GUESS_CORRECT 0x88 PMC2
+UMASK_PM_L2_SYS_GUESS_CORRECT 0x60
+
+EVENT_PM_L2_SYS_GUESS_WRONG 0x88 PMC2
+UMASK_PM_L2_SYS_GUESS_WRONG 0x68
+
+EVENT_PM_L2_CHIP_PUMP 0x88 PMC3
+UMASK_PM_L2_CHIP_PUMP 0x60
+
+EVENT_PM_L2_GROUP_PUMP 0x88 PMC3
+UMASK_PM_L2_GROUP_PUMP 0x68
+
+EVENT_PM_ISIDE_DISP 0x8A PMC0
+UMASK_PM_ISIDE_DISP 0x68
+
+EVENT_PM_ISIDE_DISP_FAIL_ADDR 0x8A PMC1
+UMASK_PM_ISIDE_DISP_FAIL_ADDR 0x60
+
+EVENT_PM_ISIDE_DISP_FAIL_OTHER 0x8A PMC1
+UMASK_PM_ISIDE_DISP_FAIL_OTHER 0x68
+
+EVENT_PM_L2_RTY_ST 0x8A PMC2
+UMASK_PM_L2_RTY_ST 0x60
+
+EVENT_PM_L2_RTY_LD 0x8A PMC2
+UMASK_PM_L2_RTY_LD 0x68
+
+EVENT_PM_L2_SYS_PUMP 0x8A PMC3
+UMASK_PM_L2_SYS_PUMP 0x68
+
+EVENT_PM_RC0_BUSY 0x8C PMC0|PMC1
+UMASK_PM_RC0_BUSY 0x60
+
+EVENT_PM_RC_USAGE 0x8C PMC0
+UMASK_PM_RC_USAGE 0x68
+
+EVENT_PM_CO_USAGE 0x8C PMC1
+UMASK_PM_CO_USAGE 0x68
+
+EVENT_PM_CO0_BUSY 0x8C PMC2|PMC3
+UMASK_PM_CO0_BUSY 0x60
+
+EVENT_PM_SN_USAGE 0x8C PMC2
+UMASK_PM_SN_USAGE 0x68
+
+EVENT_PM_ST_CAUSED_FAIL 0x8E PMC0
+UMASK_PM_ST_CAUSED_FAIL 0x60
+
+EVENT_PM_TM_LD_CAUSED_FAIL 0x8E PMC0
+UMASK_PM_TM_LD_CAUSED_FAIL 0x68
+
+EVENT_PM_TM_LD_CONF 0x8E PMC1
+UMASK_PM_TM_LD_CONF 0x60
+
+EVENT_PM_TM_FAV_CAUSED_FAIL 0x8E PMC1
+UMASK_PM_TM_FAV_CAUSED_FAIL 0x68
+
+EVENT_PM_TM_ST_CONF 0x8E PMC2
+UMASK_PM_TM_ST_CONF 0x60
+
+EVENT_PM_TM_ST_CAUSED_FAIL 0x8E PMC2
+UMASK_PM_TM_ST_CAUSED_FAIL 0x68
+
+EVENT_PM_TM_CAP_OVERFLOW 0x8E PMC3
+UMASK_PM_TM_CAP_OVERFLOW 0x60
+
+EVENT_PM_SN0_BUSY 0x90 PMC0|PMC1
+UMASK_PM_SN0_BUSY 0x60
+
+EVENT_PM_L1PF_L2MEMACC 0x90 PMC0
+UMASK_PM_L1PF_L2MEMACC 0x68
+
+EVENT_PM_ISIDE_L2MEMACC 0x90 PMC1
+UMASK_PM_ISIDE_L2MEMACC 0x68
+
+EVENT_PM_L2_LD_MISS_128B 0x92 PMC0
+UMASK_PM_L2_LD_MISS_128B 0x60
+
+EVENT_PM_L2_ST_MISS_128B 0x92 PMC0
+UMASK_PM_L2_ST_MISS_128B 0x68
+
+EVENT_PM_L2_LD_MISS_64B 0x92 PMC1
+UMASK_PM_L2_LD_MISS_64B 0x60
+
+EVENT_PM_L2_ST_MISS_64B 0x92 PMC1
+UMASK_PM_L2_ST_MISS_64B 0x68
+
+EVENT_PM_DSIDE_L2MEMACC 0x92 PMC2
+UMASK_PM_DSIDE_L2MEMACC 0x60
+
+EVENT_PM_DSIDE_OTHER_64B_L2MEMACC 0x92 PMC2
+UMASK_PM_DSIDE_OTHER_64B_L2MEMACC 0x68
+
+EVENT_PM_L2_LD_DISP 0x9E PMC0
+UMASK_PM_L2_LD_DISP 0x60
+
+EVENT_PM_L2_ST_DISP 0x9E PMC0
+UMASK_PM_L2_ST_DISP 0x68
+
+EVENT_PM_L2_LD_HIT 0x9E PMC1
+UMASK_PM_L2_LD_HIT 0x60
+
+EVENT_PM_L2_ST_HIT 0x9E PMC1
+UMASK_PM_L2_ST_HIT 0x68
+
+EVENT_PM_L2_INST 0x9E PMC2
+UMASK_PM_L2_INST 0x60
+
+EVENT_PM_L2_RTY_LD 0x9E PMC2
+UMASK_PM_L2_RTY_LD 0x68
+
+EVENT_PM_L2_INST_MISS 0x9E PMC3
+UMASK_PM_L2_INST_MISS 0x60
+
+EVENT_PM_L2_RTY_ST 0x9E PMC3
+UMASK_PM_L2_RTY_ST 0x68
+
+#################
+# L3 Events
+#################
+
+EVENT_PM_L3_PF_MISS_L3 0xA0 PMC0
+UMASK_PM_L3_PF_MISS_L3 0x60
+
+EVENT_PM_L3_CO_MEPF 0xA0 PMC0
+UMASK_PM_L3_CO_MEPF 0x68
+
+EVENT_PM_L3_CO_MEM 0xA0 PMC1
+UMASK_PM_L3_CO_MEM 0x60
+
+EVENT_PM_L3_CO_L31 0xA0 PMC1
+UMASK_PM_L3_CO_L31 0x68
+
+EVENT_PM_L3_PF_ON_CHIP_CACHE 0xA0 PMC2
+UMASK_PM_L3_PF_ON_CHIP_CACHE 0x60
+
+EVENT_PM_L3_PF_OFF_CHIP_CACHE 0xA0 PMC2
+UMASK_PM_L3_PF_OFF_CHIP_CACHE 0x68
+
+EVENT_PM_L3_PF_ON_CHIP_MEM 0xA0 PMC3
+UMASK_PM_L3_PF_ON_CHIP_MEM 0x60
+
+EVENT_PM_L3_PF_OFF_CHIP_MEM 0xA0 PMC3
+UMASK_PM_L3_PF_OFF_CHIP_MEM 0x68
+
+EVENT_PM_L3_CI_HIT 0xA2 PMC1
+UMASK_PM_L3_CI_HIT 0x60
+
+EVENT_PM_L3_CI_MISS 0xA2 PMC1
+UMASK_PM_L3_CI_MISS 0x68
+
+EVENT_PM_L3_L2_CO_HIT 0xA2 PMC2
+UMASK_PM_L3_L2_CO_HIT 0x60
+
+EVENT_PM_L3_L2_CO_MISS 0xA2 PMC2
+UMASK_PM_L3_L2_CO_MISS 0x68
+
+EVENT_PM_L3_LAT_CI_HIT 0xA2 PMC3
+UMASK_PM_L3_LAT_CI_HIT 0x60
+
+EVENT_PM_L3_LAT_CI_MISS 0xA2 PMC3
+UMASK_PM_L3_LAT_CI_MISS 0x68
+
+EVENT_PM_L3_HIT 0xA4 PMC0
+UMASK_PM_L3_HIT 0x60
+
+EVENT_PM_L3_MISS 0xA4 PMC0
+UMASK_PM_L3_MISS 0x68
+
+EVENT_PM_L3_LD_HIT 0xA4 PMC1
+UMASK_PM_L3_LD_HIT 0x60
+
+EVENT_PM_L3_LD_MISS 0xA4 PMC1
+UMASK_PM_L3_LD_MISS 0x68
+
+EVENT_PM_L3_CO_LCO 0xA4 PMC2
+UMASK_PM_L3_CO_LCO 0x60
+
+EVENT_PM_L3_CINJ 0xA4 PMC2
+UMASK_PM_L3_CINJ 0x68
+
+EVENT_PM_L3_TRANS_PF 0xA4 PMC3
+UMASK_PM_L3_TRANS_PF 0x68
+
+EVENT_PM_TM_SC_CO 0xA6 PMC0
+UMASK_PM_TM_SC_CO 0x60
+
+EVENT_PM_TM_CAM_OVERFLOW 0xA6 PMC0
+UMASK_PM_TM_CAM_OVERFLOW 0x68
+
+EVENT_PM_NON_TM_RST_SC 0xA6 PMC1
+UMASK_PM_NON_TM_RST_SC 0x60
+
+EVENT_PM_TM_RST_SC 0xA6 PMC1
+UMASK_PM_TM_RST_SC 0x68
+
+EVENT_PM_SNP_TM_HIT_M 0xA6 PMC2
+UMASK_PM_SNP_TM_HIT_M 0x60
+
+EVENT_PM_SNP_TM_HIT_T 0xA6 PMC2
+UMASK_PM_SNP_TM_HIT_T 0x68
+
+EVENT_PM_RD_FORMING_SC 0xA6 PMC3
+UMASK_PM_RD_FORMING_SC 0x60
+
+EVENT_PM_RD_CLEARING_SC 0xA6 PMC3
+UMASK_PM_RD_CLEARING_SC 0x68
+
+EVENT_PM_L3_WI_USAGE 0xA8 PMC0
+UMASK_PM_L3_WI_USAGE 0x68
+
+EVENT_PM_L3_PF_HIT_L3 0xA8 PMC1
+UMASK_PM_L3_PF_HIT_L3 0x60
+
+EVENT_PM_RD_HIT_PF 0xA8 PMC1
+UMASK_PM_RD_HIT_PF 0x68
+
+EVENT_PM_L3_CO 0xA8 PMC2
+UMASK_PM_L3_CO 0x60
+
+EVENT_PM_SN_INVL 0xA8 PMC2
+UMASK_PM_SN_INVL 0x68
+
+EVENT_PM_SN_HIT 0xA8 PMC3
+UMASK_PM_SN_HIT 0x60
+
+EVENT_PM_SN_MISS  0xA8 PMC3
+UMASK_PM_SN_MISS  0x68
+
+EVENT_PM_L3_P0_LCO_NO_DATA 0xAA PMC0
+UMASK_PM_L3_P0_LCO_NO_DATA 0x60
+
+EVENT_PM_L3_P1_LCO_NO_DATA 0xAA PMC0
+UMASK_PM_L3_P1_LCO_NO_DATA 0x68
+
+EVENT_PM_L3_P0_LCO_DATA 0xAA PMC1
+UMASK_PM_L3_P0_LCO_DATA 0x60
+
+EVENT_PM_L3_P1_LCO_DATA 0xAA PMC1
+UMASK_PM_L3_P1_LCO_DATA 0x68
+
+EVENT_PM_L3_P0_CO_MEM 0xAA PMC2
+UMASK_PM_L3_P0_CO_MEM 0x60
+
+EVENT_PM_L3_P1_CO_MEM 0xAA PMC2
+UMASK_PM_L3_P1_CO_MEM 0x68
+
+EVENT_PM_L3_P0_CO_L31 0xAA PMC3
+UMASK_PM_L3_P0_CO_L31 0x60
+
+EVENT_PM_L3_P1_CO_L31 0xAA PMC3
+UMASK_PM_L3_P1_CO_L31 0x68
+
+EVENT_PM_L3_SN_USAGE 0xAC PMC0
+UMASK_PM_L3_SN_USAGE 0x60
+
+EVENT_PM_L3_CI_USAGE 0xAC PMC0
+UMASK_PM_L3_CI_USAGE 0x68
+
+EVENT_PM_L3_PF_USAGE 0xAC PMC1
+UMASK_PM_L3_PF_USAGE 0x60
+
+EVENT_PM_L3_RD_USAGE 0xAC PMC1
+UMASK_PM_L3_RD_USAGE 0x68
+
+EVENT_PM_L3_SN0_BUSY 0xAC PMC2|PMC3
+UMASK_PM_L3_SN0_BUSY 0x60
+
+EVENT_PM_L3_CO0_BUSY 0xAC PMC2|PMC3
+UMASK_PM_L3_CO0_BUSY 0x68
+
+EVENT_PM_L3_P0_PF_RTY 0xAE PMC0
+UMASK_PM_L3_P0_PF_RTY 0x60
+
+EVENT_PM_L3_P1_PF_RTY 0xAE PMC0
+UMASK_PM_L3_P1_PF_RTY 0x68
+
+EVENT_PM_L3_P2_PF_RTY 0xAE PMC1
+UMASK_PM_L3_P2_PF_RTY 0x60
+
+EVENT_PM_L3_P3_PF_RTY 0xAE PMC1
+UMASK_PM_L3_P3_PF_RTY 0x68
+
+EVENT_PM_L3_P0_CO_RTY 0xAE PMC2
+UMASK_PM_L3_P0_CO_RTY 0x60
+
+EVENT_PM_L3_P1_CO_RTY 0xAE PMC2
+UMASK_PM_L3_P1_CO_RTY 0x68
+
+EVENT_PM_L3_P2_CO_RTY 0xAE PMC3
+UMASK_PM_L3_P2_CO_RTY 0x60
+
+EVENT_PM_L3_P3_CO_RTY 0xAE PMC3
+UMASK_PM_L3_P3_CO_RTY 0x68
+
+EVENT_PM_L3_P0_NODE_PUMP 0xB0 PMC0
+UMASK_PM_L3_P0_NODE_PUMP 0x60
+
+EVENT_PM_L3_P1_NODE_PUMP 0xB0 PMC0
+UMASK_PM_L3_P1_NODE_PUMP 0x68
+
+EVENT_PM_L3_P0_GRP_PUMP 0xB0 PMC1
+UMASK_PM_L3_P0_GRP_PUMP 0x60
+
+EVENT_PM_L3_P1_GRP_PUMP 0xB0 PMC1
+UMASK_PM_L3_P1_GRP_PUMP 0x68
+
+EVENT_PM_L3_P0_SYS_PUMP 0xB0 PMC2
+UMASK_PM_L3_P0_SYS_PUMP 0x60
+
+EVENT_PM_L3_P1_SYS_PUMP 0xB0 PMC2
+UMASK_PM_L3_P1_SYS_PUMP 0x68
+
+EVENT_PM_L3_LOC_GUESS_CORRECT 0xB2 PMC0
+UMASK_PM_L3_LOC_GUESS_CORRECT 0x60
+
+EVENT_PM_L3_GRP_GUESS_CORRECT 0xB2 PMC0
+UMASK_PM_L3_GRP_GUESS_CORRECT 0x68
+
+EVENT_PM_L3_SYS_GUESS_CORRECT 0xB2 PMC1
+UMASK_PM_L3_SYS_GUESS_CORRECT 0x60
+
+EVENT_PM_L3_LOC_GUESS_WRONG 0xB2 PMC1
+UMASK_PM_L3_LOC_GUESS_WRONG 0x68
+
+EVENT_PM_L3_GRP_GUESS_WRONG_LOW 0xB2 PMC2
+UMASK_PM_L3_GRP_GUESS_WRONG_LOW 0x60
+
+EVENT_PM_L3_GRP_GUESS_WRONG_HIGH 0xB2 PMC2
+UMASK_PM_L3_GRP_GUESS_WRONG_HIGH 0x68
+
+EVENT_PM_L3_SYS_GUESS_WRONG 0xB2 PMC3
+UMASK_PM_L3_SYS_GUESS_WRONG 0x60
+
+EVENT_PM_L3_P0_LCO_RTY 0xB4 PMC0
+UMASK_PM_L3_P0_LCO_RTY 0x60
+
+EVENT_PM_L3_P1_LCO_RTY 0xB4 PMC0
+UMASK_PM_L3_P1_LCO_RTY 0x68
+
+EVENT_PM_L3_P2_LCO_RTY 0xB4 PMC1
+UMASK_PM_L3_P2_LCO_RTY 0x60
+
+EVENT_PM_L3_P3_LCO_RTY 0xB4 PMC1
+UMASK_PM_L3_P3_LCO_RTY 0x68
+
+EVENT_PM_L3_PF0_BUSY 0xB4 PMC2
+UMASK_PM_L3_PF0_BUSY 0x60
+
+EVENT_PM_L3_RD0_BUSY 0xB4 PMC2
+UMASK_PM_L3_RD0_BUSY 0x68
+
+EVENT_PM_L3_PF0_BUSY 0xB4 PMC3
+UMASK_PM_L3_PF0_BUSY 0x60
+
+EVENT_PM_L3_RD0_BUSY 0xB4 PMC3
+UMASK_PM_L3_RD0_BUSY 0x68
+
+EVENT_PM_L3_WI0_BUSY 0xB6 PMC0|PMC1
+UMASK_PM_L3_WI0_BUSY 0x60
+
+
+#################
+# CPI Stack Events
+#################
+
+EVENT_PM_CMPLU_STALL_LRQ_OTHER 0x04 PMC0
+UMASK_PM_CMPLU_STALL_LRQ_OTHER 0x00
+
+EVENT_PM_CMPLU_STALL_THRD 0x1C PMC0
+UMASK_PM_CMPLU_STALL_THRD 0x00
+
+EVENT_PM_CMPLU_STALL_LARX 0x2A PMC0
+UMASK_PM_CMPLU_STALL_LARX 0x00
+
+EVENT_PM_CMPLU_STALL_LSU_FIN 0x3A PMC0
+UMASK_PM_CMPLU_STALL_LSU_FIN 0x00
+
+EVENT_PM_CMPLU_STALL_DMISS_L2L3 0x3C PMC0
+UMASK_PM_CMPLU_STALL_DMISS_L2L3 0x00
+
+EVENT_PM_CMPLU_STALL_DFLONG 0x5A PMC0
+UMASK_PM_CMPLU_STALL_DFLONG 0x00
+
+EVENT_PM_CMPLU_STALL_DP 0x5C PMC0
+UMASK_PM_CMPLU_STALL_DP 0x00
+
+EVENT_PM_CMPLU_STALL_TEND 0x50 PMC0
+UMASK_PM_CMPLU_STALL_TEND 0xE0
+
+EVENT_PM_CMPLU_STALL_SLB 0x52 PMC0
+UMASK_PM_CMPLU_STALL_SLB 0xE0
+
+EVENT_PM_CMPLU_STALL 0x54 PMC0
+UMASK_PM_CMPLU_STALL 0xE0
+
+EVENT_PM_CMPLU_STALL_FLUSH_ANY_THREAD 0x56 PMC0
+UMASK_PM_CMPLU_STALL_FLUSH_ANY_THREAD 0xE0
+
+EVENT_PM_CMPLU_STALL_ANY_SYNC 0x5A PMC0
+UMASK_PM_CMPLU_STALL_ANY_SYNC 0xE0
+
+EVENT_PM_CMPLU_STALL_NESTED_TBEGIN 0x5C PMC0
+UMASK_PM_CMPLU_STALL_NESTED_TBEGIN 0xE0
+
+EVENT_PM_ICT_NOSLOT_DISP_HELD_TBEGIN 0x64 PMC0
+UMASK_PM_ICT_NOSLOT_DISP_HELD_TBEGIN 0x00
+
+EVENT_PM_ICT_NOSLOT_CYC 0xF8 PMC0
+UMASK_PM_ICT_NOSLOT_CYC 0x00
+
+EVENT_PM_CMPLU_STALL_LSU 0x10 PMC1
+UMASK_PM_CMPLU_STALL_LSU 0xC0
+
+EVENT_PM_CMPLU_STALL_DCACHE_MISS 0x12 PMC1
+UMASK_PM_CMPLU_STALL_DCACHE_MISS 0xC0
+
+EVENT_PM_CMPLU_STALL_STORE_FINISH 0x14 PMC1
+UMASK_PM_CMPLU_STALL_STORE_FINISH 0xC0
+
+EVENT_PM_CMPLU_STALL_PASTE 0x16 PMC1
+UMASK_PM_CMPLU_STALL_PASTE 0xC0
+
+EVENT_PM_CMPLU_STALL_DMISS_L21_L31 0x18 PMC1
+UMASK_PM_CMPLU_STALL_DMISS_L21_L31 0xC0
+
+EVENT_PM_CMPLU_STALL_LHS 0x1A PMC1
+UMASK_PM_CMPLU_STALL_LHS 0xC0
+
+EVENT_PM_CMPLU_STALL_DMISS_REMOTE 0x1C PMC1
+UMASK_PM_CMPLU_STALL_DMISS_REMOTE 0xC0
+
+EVENT_PM_CMPLU_STALL_SYNC_PMU_INT 0x1E PMC1
+UMASK_PM_CMPLU_STALL_SYNC_PMU_INT 0xC0
+
+EVENT_PM_CMPLU_STALL_DFU 0x12 PMC1
+UMASK_PM_CMPLU_STALL_DFU 0xD0
+
+EVENT_PM_CMPLU_STALL_LRQ_FULL 0x14 PMC1
+UMASK_PM_CMPLU_STALL_LRQ_FULL 0xD0
+
+EVENT_PM_CMPLU_STALL_FXU 0x16 PMC1
+UMASK_PM_CMPLU_STALL_FXU 0xD0
+
+EVENT_PM_CMPLU_STALL_EXEC_UNIT 0x18 PMC1
+UMASK_PM_CMPLU_STALL_EXEC_UNIT 0xD0
+
+EVENT_PM_ICT_NOSLOT_IC_MISS 0x1A PMC1
+UMASK_PM_ICT_NOSLOT_IC_MISS 0xD0
+
+EVENT_PM_CMPLU_STALL_STCX 0x1C PMC1
+UMASK_PM_CMPLU_STALL_STCX 0xD0
+
+EVENT_PM_ICT_NOSLOT_DISP_HELD_ISSQ 0x1E PMC1
+UMASK_PM_ICT_NOSLOT_DISP_HELD_ISSQ 0xD0
+
+EVENT_PM_CMPLU_STALL_VFXLONG 0x18 PMC1
+UMASK_PM_CMPLU_STALL_VFXLONG 0xE0
+
+EVENT_PM_CMPLU_STALL_LSU_FLUSH_NEXT 0x1A PMC1
+UMASK_PM_CMPLU_STALL_LSU_FLUSH_NEXT 0xE0
+
+EVENT_PM_CMPLU_STALL_TLBIE 0x1C PMC1
+UMASK_PM_CMPLU_STALL_TLBIE 0xE0
+
+EVENT_PM_CMPLU_STALL_NTC_FLUSH 0x1E PMC1
+UMASK_PM_CMPLU_STALL_NTC_FLUSH 0xE0
+
+EVENT_PM_CMPLU_STALL_EMQ_FULL 0x04 PMC2
+UMASK_PM_CMPLU_STALL_EMQ_FULL 0x00
+
+EVENT_PM_CMPLU_STALL_OTHER_CMPL 0x06 PMC2
+UMASK_PM_CMPLU_STALL_OTHER_CMPL 0x00
+
+EVENT_PM_CMPLU_STALL_PM 0x0A PMC2
+UMASK_PM_CMPLU_STALL_PM 0x00
+
+EVENT_PM_CMPLU_STALL_STORE_FIN_ARB 0x14 PMC2
+UMASK_PM_CMPLU_STALL_STORE_FIN_ARB 0x00
+
+EVENT_PM_CMPLU_STALL_SRQ_FULL 0x16 PMC2
+UMASK_PM_CMPLU_STALL_SRQ_FULL 0x00
+
+EVENT_PM_ICT_NOSLOT_DISP_HELD_HB_FULL 0x18 PMC2
+UMASK_PM_ICT_NOSLOT_DISP_HELD_HB_FULL 0x00
+
+EVENT_PM_CMPLU_STALL_STORE_DATA 0x26 PMC2
+UMASK_PM_CMPLU_STALL_STORE_DATA 0x00
+
+EVENT_PM_CMPLU_STALL_SPEC_FINISH 0x28 PMC2
+UMASK_PM_CMPLU_STALL_SPEC_FINISH 0x00
+
+EVENT_PM_CMPLU_STALL_DMISS_LMEM 0x38 PMC2
+UMASK_PM_CMPLU_STALL_DMISS_LMEM 0x00
+
+EVENT_PM_CMPLU_STALL_EXCEPTION 0x3A PMC2
+UMASK_PM_CMPLU_STALL_EXCEPTION 0x00
+
+EVENT_PM_CMPLU_STALL_NESTED_TEND 0x3C PMC2
+UMASK_PM_CMPLU_STALL_NESTED_TEND 0x00
+
+EVENT_PM_CMPLU_STALL_VDPLONG 0x5A PMC2
+UMASK_PM_CMPLU_STALL_VDPLONG 0xC0
+
+EVENT_PM_CMPLU_STALL_VFXU 0x5C PMC2
+UMASK_PM_CMPLU_STALL_VFXU 0xC0
+
+EVENT_PM_CMPLU_STALL_LSU_MFSPR 0x56 PMC2
+UMASK_PM_CMPLU_STALL_LSU_MFSPR 0x40
+
+EVENT_PM_ICT_NOSLOT_BR_MPRED_ICMISS 0x58 PMC2
+UMASK_PM_ICT_NOSLOT_BR_MPRED_ICMISS 0x40
+
+EVENT_PM_CMPLU_STALL_DPLONG 0x5C PMC2
+UMASK_PM_CMPLU_STALL_DPLONG 0x40
+
+EVENT_PM_ICT_NOSLOT_IC_L3 0x52 PMC2
+UMASK_PM_ICT_NOSLOT_IC_L3 0xE0
+
+EVENT_PM_CMPLU_STALL_STORE_PIPE_ARB 0x10 PMC3
+UMASK_PM_CMPLU_STALL_STORE_PIPE_ARB 0xC0
+
+EVENT_PM_CMPLU_STALL_ERAT_MISS 0x12 PMC3
+UMASK_PM_CMPLU_STALL_ERAT_MISS 0xC0
+
+EVENT_PM_CMPLU_STALL_LMQ_FULL 0x14 PMC3
+UMASK_PM_CMPLU_STALL_LMQ_FULL 0xC0
+
+EVENT_PM_CMPLU_STALL_DMISS_L2L3_CONFLICT 0x16 PMC3
+UMASK_PM_CMPLU_STALL_DMISS_L2L3_CONFLICT 0xC0
+
+EVENT_PM_CMPLU_STALL_DMISS_L3MISS 0x1A PMC3
+UMASK_PM_CMPLU_STALL_DMISS_L3MISS 0xC0
+
+EVENT_PM_CMPLU_STALL_ST_FWD 0x1C PMC3
+UMASK_PM_CMPLU_STALL_ST_FWD 0xC0
+
+EVENT_PM_CMPLU_STALL_CRYPTO 0x1E PMC3
+UMASK_PM_CMPLU_STALL_CRYPTO 0xC0
+
+EVENT_PM_CMPLU_STALL_LOAD_FINISH 0x14 PMC3
+UMASK_PM_CMPLU_STALL_LOAD_FINISH 0xD0
+
+EVENT_PM_CMPLU_STALL_FXLONG 0x16 PMC3
+UMASK_PM_CMPLU_STALL_FXLONG 0xD0
+
+EVENT_PM_CMPLU_STALL_BRU 0x18 PMC3
+UMASK_PM_CMPLU_STALL_BRU 0xD0
+
+EVENT_PM_CMPLU_STALL_EIEIO 0x1A PMC3
+UMASK_PM_CMPLU_STALL_EIEIO 0xD0
+
+EVENT_PM_ICT_NOSLOT_DISP_HELD_SYNC 0x1C PMC3
+UMASK_PM_ICT_NOSLOT_DISP_HELD_SYNC 0xD0
+
+EVENT_PM_ICT_NOSLOT_BR_MPRED 0x1E PMC3
+UMASK_PM_ICT_NOSLOT_BR_MPRED 0xD0
+
+EVENT_PM_ICT_NOSLOT_IC_L3MISS 0x10 PMC3
+UMASK_PM_ICT_NOSLOT_IC_L3MISS 0xE0
+
+EVENT_PM_CMPLU_STALL_MTFPSCR 0x12 PMC3
+UMASK_PM_CMPLU_STALL_MTFPSCR 0xE0
+
+EVENT_PM_CMPLU_STALL_LSAQ_ARB 0x16 PMC3
+UMASK_PM_CMPLU_STALL_LSAQ_ARB 0xE0
+
+EVENT_PM_CMPLU_STALL_NTC_DISP_FIN 0x18 PMC3
+UMASK_PM_CMPLU_STALL_NTC_DISP_FIN 0xE0
+
+EVENT_PM_ICT_NOSLOT_DISP_HELD 0x1A PMC3
+UMASK_PM_ICT_NOSLOT_DISP_HELD 0xE0
+
+EVENT_PM_CMPLU_STALL_VDP 0x5C PMC3
+UMASK_PM_CMPLU_STALL_VDP 0x40
+
+#################
+# Transactional Memory Events
+#################
+
+EVENT_PM_TM_TX_PASS_RUN_INST 0x14 PMC3
+UMASK_PM_TM_TX_PASS_RUN_INST 0xE0
+
+EVENT_PM_TM_PASSED 0x52 PMC1
+UMASK_PM_TM_PASSED 0xE0
+
+EVENT_PM_TM_ABORTS 0x56 PMC2
+UMASK_PM_TM_ABORTS 0x00
+
+EVENT_PM_TM_TRANS_RUN_CYC 0x60 PMC0
+UMASK_PM_TM_TRANS_RUN_CYC 0x00
+
+EVENT_PM_TM_TX_PASS_RUN_CYC 0x12 PMC1
+UMASK_PM_TM_TX_PASS_RUN_CYC 0xE0
+
+EVENT_PM_TM_OUTER_TBEGIN_DISP 0x5E PMC3
+UMASK_PM_TM_OUTER_TBEGIN_DISP 0xE0
+
+
+
+#################
+# PMC Events
+#################
+
+EVENT_PM_SUSPENDED 0x00 PMC0|PMC1|PMC2|PMC3
+UMASK_PM_SUSPENDED 0x00
+
+EVENT_PM_RUN_SPURR 0x08 PMC0
+UMASK_PM_RUN_SPURR 0x00
+
+EVENT_PM_RUN_PURR 0xF4 PMC3
+UMASK_PM_RUN_PURR 0x00
+
+EVENT_PM_PMC0_OVERFLOW 0x10 PMC1
+UMASK_PM_PMC0_OVERFLOW 0x00
+
+EVENT_PM_PMC0_SAVED 0x10 PMC3
+UMASK_PM_PMC0_SAVED 0xD0
+
+EVENT_PM_PMC0_REWIND 0x2C PMC3
+UMASK_PM_PMC0_REWIND 0xD0
+
+EVENT_PM_PMC1_OVERFLOW 0x10 PMC2
+UMASK_PM_PMC1_OVERFLOW 0x00
+
+EVENT_PM_PMC1_SAVED 0x22 PMC0
+UMASK_PM_PMC1_SAVED 0x00
+
+EVENT_PM_PMC1_REWIND 0x20 PMC2
+UMASK_PM_PMC1_REWIND 0x00
+
+EVENT_PM_PMC2_OVERFLOW 0x10 PMC3
+UMASK_PM_PMC2_OVERFLOW 0x00
+
+EVENT_PM_PMC2_SAVED 0x12 PMC3
+UMASK_PM_PMC2_SAVED 0xD0
+
+EVENT_PM_PMC2_REWIND 0x0A PMC0
+UMASK_PM_PMC2_REWIND 0x00
+
+EVENT_PM_PMC3_OVERFLOW 0x10 PMC0
+UMASK_PM_PMC3_OVERFLOW 0x00
+
+EVENT_PM_PMC3_SAVED 0x22 PMC2
+UMASK_PM_PMC3_SAVED 0x00
+
+EVENT_PM_PMC3_REWIND 0x20 PMC0
+UMASK_PM_PMC3_REWIND 0x00
+
+EVENT_PM_PMC4_OVERFLOW 0x24 PMC0
+UMASK_PM_PMC4_OVERFLOW 0x00
+
+EVENT_PM_PMC5_OVERFLOW 0x24 PMC2
+UMASK_PM_PMC5_OVERFLOW 0x00
+
+EVENT_PM_THRESH_EXC_32 0xE6 PMC1
+UMASK_PM_THRESH_EXC_32 0x01
+
+EVENT_PM_THRESH_EXC_64 0xE8 PMC2
+UMASK_PM_THRESH_EXC_64 0x01
+
+EVENT_PM_THRESH_EXC_128 0xEA PMC3
+UMASK_PM_THRESH_EXC_128 0x01
+
+EVENT_PM_THRESH_EXC_256 0xE8 PMC0
+UMASK_PM_THRESH_EXC_256 0x01
+
+EVENT_PM_THRESH_EXC_512 0xE8 PMC1
+UMASK_PM_THRESH_EXC_512 0x01
+
+EVENT_PM_THRESH_EXC_1024 0xEA PMC2
+UMASK_PM_THRESH_EXC_1024 0x01
+
+EVENT_PM_THRESH_EXC_2048 0xAC PMC3
+UMASK_PM_THRESH_EXC_2048 0x01
+
+EVENT_PM_THRESH_EXC_4096 0xE6 PMC0
+UMASK_PM_THRESH_EXC_4096 0x01
+
+EVENT_PM_THRESH_MET 0xEC PMC0
+UMASK_PM_THRESH_MET 0x01
+
+EVENT_PM_THRESH_NOT_MET 0x6E PMC3
+UMASK_PM_THRESH_NOT_MET 0x01
+
+EVENT_PM_THRESH_ACC 0x54 PMC1
+UMASK_PM_THRESH_ACC 0x41
+
+EVENT_PM_CYC 0x1E PMC0|PMC1|PMC2|PMC3
+UMASK_PM_CYC 0x00
+
+EVENT_PM_CYC 0xF0 PMC0
+UMASK_PM_CYC 0x00
+
+EVENT_PM_RUN_CYC 0xF4 PMC1
+UMASK_PM_RUN_CYC 0x00
+
+EVENT_PM_ANY_THRD_RUN_CYC 0xFA PMC0
+UMASK_PM_ANY_THRD_RUN_CYC 0x00
+
+EVENT_PM_THRD_ALL_RUN_CYC 0x0C PMC1
+UMASK_PM_THRD_ALL_RUN_CYC 0x00
+
+EVENT_PM_RUN_CYC_SMT4_MODE 0x6C PMC1
+UMASK_PM_RUN_CYC_SMT4_MODE 0x00
+
+EVENT_PM_HV_CYC 0x0A PMC1
+UMASK_PM_HV_CYC 0x00
+
+
+EVENT_PM_EXT_INT 0xF8 PMC1
+UMASK_PM_EXT_INT 0x00
+
+EVENT_PM_FREQ_UP 0x0C PMC3
+UMASK_PM_FREQ_UP 0x00
+
+EVENT_PM_FREQ_DOWN 0x0C PMC2
+UMASK_PM_FREQ_DOWN 0x00
+
+# Multiply by 4 to obtain the number of SMP interconnect cycles
+EVENT_PM_NEST_REF_CLK 0x6E PMC2
+UMASK_PM_NEST_REF_CLK 0x00
+
+EVENT_PM_TB_BIT_TRANS 0xF8 PMC2
+UMASK_PM_TB_BIT_TRANS 0x00
+
+EVENT_PM_PROBE_NOP_DISP 0x14 PMC3
+UMASK_PM_PROBE_NOP_DISP 0x00
+
+
+
+#################
+# Nest IMC Events
+#################
+
+
+EVENT_PM_MBA0_DRAM_CLK_CYC    0x58 MBOX0
+UMASK_PM_MBA0_DRAM_CLK_CYC    0x04
+
+EVENT_PM_MBA1_DRAM_CLK_CYC    0x70 MBOX1
+UMASK_PM_MBA1_DRAM_CLK_CYC    0x04
+
+EVENT_PM_MBA2_DRAM_CLK_CYC    0x88 MBOX2
+UMASK_PM_MBA2_DRAM_CLK_CYC    0x04
+
+EVENT_PM_MBA3_DRAM_CLK_CYC    0xA0 MBOX3
+UMASK_PM_MBA3_DRAM_CLK_CYC    0x04
+
+EVENT_PM_MBA4_DRAM_CLK_CYC    0x68 MBOX4
+UMASK_PM_MBA4_DRAM_CLK_CYC    0x05
+
+EVENT_PM_MBA5_DRAM_CLK_CYC    0x80 MBOX5
+UMASK_PM_MBA5_DRAM_CLK_CYC    0x05
+
+EVENT_PM_MBA6_DRAM_CLK_CYC    0x98 MBOX6
+UMASK_PM_MBA6_DRAM_CLK_CYC    0x05
+
+EVENT_PM_MBA7_DRAM_CLK_CYC    0xB0 MBOX7
+UMASK_PM_MBA7_DRAM_CLK_CYC    0x05
+
+EVENT_PM_MBA0_READ_BYTES    0x48 MBOX0
+UMASK_PM_MBA0_READ_BYTES    0x04
+
+EVENT_PM_MBA1_READ_BYTES    0x60 MBOX1
+UMASK_PM_MBA1_READ_BYTES    0x04
+
+EVENT_PM_MBA2_READ_BYTES    0x78 MBOX2
+UMASK_PM_MBA2_READ_BYTES    0x04
+
+EVENT_PM_MBA3_READ_BYTES    0x90 MBOX3
+UMASK_PM_MBA3_READ_BYTES    0x04
+
+EVENT_PM_MBA4_READ_BYTES    0x58 MBOX4
+UMASK_PM_MBA4_READ_BYTES    0x05
+
+EVENT_PM_MBA5_READ_BYTES    0x70 MBOX5
+UMASK_PM_MBA5_READ_BYTES    0x05
+
+EVENT_PM_MBA6_READ_BYTES    0x88 MBOX6
+UMASK_PM_MBA6_READ_BYTES    0x05
+
+EVENT_PM_MBA7_READ_BYTES    0xA0 MBOX7
+UMASK_PM_MBA7_READ_BYTES    0x05
+
+EVENT_PM_MBA0_WRITE_BYTES    0x50 MBOX0
+UMASK_PM_MBA0_WRITE_BYTES    0x04
+
+EVENT_PM_MBA1_WRITE_BYTES    0x68 MBOX1
+UMASK_PM_MBA1_WRITE_BYTES    0x04
+
+EVENT_PM_MBA2_WRITE_BYTES    0x80 MBOX2
+UMASK_PM_MBA2_WRITE_BYTES    0x04
+
+EVENT_PM_MBA3_WRITE_BYTES    0x98 MBOX3
+UMASK_PM_MBA3_WRITE_BYTES    0x04
+
+EVENT_PM_MBA4_WRITE_BYTES    0x60 MBOX4
+UMASK_PM_MBA4_WRITE_BYTES    0x05
+
+EVENT_PM_MBA5_WRITE_BYTES    0x78 MBOX5
+UMASK_PM_MBA5_WRITE_BYTES    0x05
+
+EVENT_PM_MBA6_WRITE_BYTES    0x90 MBOX6
+UMASK_PM_MBA6_WRITE_BYTES    0x05
+
+EVENT_PM_MBA7_WRITE_BYTES    0xA8 MBOX7
+UMASK_PM_MBA7_WRITE_BYTES    0x05
+
+#################
+# X Links/SMP Links
+#################
+
+EVENT_PM_XLINK0_IN_EVEN_CYC 0x58 QBOX0
+UMASK_PM_XLINK0_IN_EVEN_CYC 0x01
+
+EVENT_PM_XLINK0_IN_ODD_CYC 0x78 QBOX0
+UMASK_PM_XLINK0_IN_ODD_CYC 0x01
+
+EVENT_PM_XLINK1_OUT_EVEN_CYC 0x18 QBOX1
+UMASK_PM_XLINK1_OUT_EVEN_CYC 0x01
+
+EVENT_PM_XLINK1_OUT_ODD_CYC 0x38 QBOX1
+UMASK_PM_XLINK1_OUT_ODD_CYC 0x01
+
+EVENT_PM_XLINK2_OUT_EVEN_CYC 0x98 QBOX2
+UMASK_PM_XLINK2_OUT_EVEN_CYC 0x01
+
+EVENT_PM_XLINK2_OUT_ODD_CYC 0xB8 QBOX2
+UMASK_PM_XLINK2_OUT_ODD_CYC 0x01
+
+EVENT_PM_XLINK0_IN_EVEN_ANY_RCMD 0x60 QBOX0
+UMASK_PM_XLINK0_IN_EVEN_ANY_RCMD 0x01
+
+EVENT_PM_XLINK0_IN_ODD_ANY_RCMD 0x80 QBOX0
+UMASK_PM_XLINK0_IN_ODD_ANY_RCMD 0x01
+
+EVENT_PM_XLINK1_OUT_EVEN_ANY_RCMD 0x20 QBOX1
+UMASK_PM_XLINK1_OUT_EVEN_ANY_RCMD 0x01
+
+EVENT_PM_XLINK1_OUT_ODD_ANY_RCMD 0x40 QBOX1
+UMASK_PM_XLINK1_OUT_ODD_ANY_RCMD 0x01
+
+EVENT_PM_XLINK2_OUT_EVEN_ANY_RCMD 0xA0 QBOX2
+UMASK_PM_XLINK2_OUT_EVEN_ANY_RCMD 0x01
+
+EVENT_PM_XLINK2_OUT_ODD_ANY_RCMD 0xC0 QBOX2
+UMASK_PM_XLINK2_OUT_ODD_ANY_RCMD 0x01
+
+EVENT_PM_XLINK0_IN_EVEN_DATA_COUNT 0x68 QBOX0
+UMASK_PM_XLINK0_IN_EVEN_DATA_COUNT 0x01
+
+EVENT_PM_XLINK0_IN_ODD_DATA_COUNT 0x88 QBOX0
+UMASK_PM_XLINK0_IN_ODD_DATA_COUNT 0x01
+
+EVENT_PM_XLINK1_OUT_EVEN_DATA_COUNT 0x28 QBOX1
+UMASK_PM_XLINK1_OUT_EVEN_DATA_COUNT 0x01
+
+EVENT_PM_XLINK1_OUT_ODD_DATA_COUNT 0x48 QBOX1
+UMASK_PM_XLINK1_OUT_ODD_DATA_COUNT 0x01
+
+EVENT_PM_XLINK2_OUT_EVEN_DATA_COUNT 0xA8 QBOX2
+UMASK_PM_XLINK2_OUT_EVEN_DATA_COUNT 0x01
+
+EVENT_PM_XLINK2_OUT_ODD_DATA_COUNT 0xC8 QBOX2
+UMASK_PM_XLINK2_OUT_ODD_DATA_COUNT 0x01
+
+EVENT_PM_XLINK0_IN_EVEN_TOTAL_UTIL 0x70 QBOX0
+UMASK_PM_XLINK0_IN_EVEN_TOTAL_UTIL 0x01
+
+EVENT_PM_XLINK0_IN_ODD_TOTAL_UTIL 0x90 QBOX0
+UMASK_PM_XLINK0_IN_ODD_TOTAL_UTIL 0x01
+
+EVENT_PM_XLINK1_OUT_EVEN_TOTAL_UTIL 0x30 QBOX1
+UMASK_PM_XLINK1_OUT_EVEN_TOTAL_UTIL 0x01
+
+EVENT_PM_XLINK1_OUT_ODD_TOTAL_UTIL 0x50 QBOX1
+UMASK_PM_XLINK1_OUT_ODD_TOTAL_UTIL 0x01
+
+EVENT_PM_XLINK2_OUT_EVEN_TOTAL_UTIL 0xB0 QBOX2
+UMASK_PM_XLINK2_OUT_EVEN_TOTAL_UTIL 0x01
+
+EVENT_PM_XLINK2_OUT_ODD_TOTAL_UTIL 0xD0 QBOX2
+UMASK_PM_XLINK2_OUT_ODD_TOTAL_UTIL 0x01
+
+#################
+# PB (SMP interconnect)
+#################
+
+EVENT_PM_PB_EVENT_VG_PUMP 0x8 SBOX
+UMASK_PM_PB_EVENT_VG_PUMP 0x00
+
+EVENT_PM_PB_EVENT_LNS_PUMP 0x10 SBOX
+UMASK_PM_PB_EVENT_LNS_PUMP 0x00
+
+EVENT_PM_PB_EVENT_GROUP_PUMP 0x18 SBOX
+UMASK_PM_PB_EVENT_GROUP_PUMP 0x00
+
+EVENT_PM_PB_EVENT_RNS_PUMP 0x20 SBOX
+UMASK_PM_PB_EVENT_RNS_PUMP 0x00
+
+EVENT_PM_PB_EVENT_RTY_VG_PUMP 0x28 SBOX
+UMASK_PM_PB_EVENT_RTY_VG_PUMP 0x00
+
+EVENT_PM_PB_EVENT_RTY_LNS_PUMP 0x30 SBOX
+UMASK_PM_PB_EVENT_RTY_LNS_PUMP 0x00
+
+EVENT_PM_PB_EVENT_RTY_GROUP_PUMP 0x38 SBOX
+UMASK_PM_PB_EVENT_RTY_GROUP_PUMP 0x00
+
+EVENT_PM_PB_EVENT_RTY_RNS_PUMP 0x40 SBOX
+UMASK_PM_PB_EVENT_RTY_RNS_PUMP 0x00
+
+EVENT_PM_PB_CYC 0x80 SBOX
+UMASK_PM_PB_CYC 0x00
+
+EVENT_PM_PB_VG_PUMP_P01 0x88 SBOX
+UMASK_PM_PB_VG_PUMP_P01 0x00
+
+EVENT_PM_PB_LNS_PUMP_P01 0x90 SBOX
+UMASK_PM_PB_LNS_PUMP_P01 0x00
+
+EVENT_PM_PB_GROUP_PUMP_P01 0x98 SBOX
+UMASK_PM_PB_GROUP_PUMP_P01 0x00
+
+EVENT_PM_PB_RNS_PUMP_P01 0xA0 SBOX
+UMASK_PM_PB_RNS_PUMP_P01 0x00
+
+EVENT_PM_PB_INT_DATA_XFER 0xA8 SBOX
+UMASK_PM_PB_INT_DATA_XFER 0x00
+
+EVENT_PM_PB_EXT_DATA_XFER 0xB0 SBOX
+UMASK_PM_PB_EXT_DATA_XFER 0x00
+
+EVENT_PM_PB_NNS_PUMP 0xB8 SBOX
+UMASK_PM_PB_NNS_PUMP 0x00
+
+EVENT_PM_PB_RTY_NNS_PUMP_P01 0xC0 SBOX
+UMASK_PM_PB_RTY_NNS_PUMP_P01 0x00
+
+EVENT_PM_PB_CYC2 0x00 SBOX
+UMASK_PM_PB_CYC2 0x01
+
+#################
+# MCS (Memory controller synchronous)
+#################
+
+EVENT_PM_MCS01_64B_RD_OR_WR_DISP_PORT01 0xC8 BBOX0
+UMASK_PM_MCS01_64B_RD_OR_WR_DISP_PORT01 0x00
+
+EVENT_PM_MCS23_64B_RD_OR_WR_DISP_PORT01 0x48 BBOX1
+UMASK_PM_MCS23_64B_RD_OR_WR_DISP_PORT01 0x00
+
+EVENT_PM_MCS01_64B_RD_DISP_PORT01 0xD0 BBOX0
+UMASK_PM_MCS01_64B_RD_DISP_PORT01 0x00
+
+EVENT_PM_MCS23_64B_RD_DISP_PORT01 0x50 BBOX1
+UMASK_PM_MCS23_64B_RD_DISP_PORT01 0x00
+
+EVENT_PM_MCS01_64B_WR_DISP_PORT01 0xD8 BBOX0
+UMASK_PM_MCS01_64B_WR_DISP_PORT01 0x00
+
+EVENT_PM_MCS23_64B_WR_DISP_PORT01 0x58 BBOX1
+UMASK_PM_MCS23_64B_WR_DISP_PORT01 0x00
+
+EVENT_PM_MCS01_AMO_OP_DISP_MC23_PORT01 0xE0 BBOX0
+UMASK_PM_MCS01_AMO_OP_DISP_MC23_PORT01 0x00
+
+EVENT_PM_MCS23_AMO_OP_DISP_MC01_PORT01 0x60 BBOX1
+UMASK_PM_MCS23_AMO_OP_DISP_MC01_PORT01 0x00
+
+EVENT_PM_MCS01_64B_RD_OR_WR_DISP_PORT23 0xE8 BBOX0
+UMASK_PM_MCS01_64B_RD_OR_WR_DISP_PORT23 0x00
+
+EVENT_PM_MCS23_64B_RD_OR_WR_DISP_PORT23 0x68 BBOX1
+UMASK_PM_MCS23_64B_RD_OR_WR_DISP_PORT23 0x00
+
+EVENT_PM_MCS01_64B_RD_DISP_PORT23 0xF0 BBOX0
+UMASK_PM_MCS01_64B_RD_DISP_PORT23 0x00
+
+EVENT_PM_MCS23_64B_RD_DISP_PORT23 0x70 BBOX1
+UMASK_PM_MCS23_64B_RD_DISP_PORT23 0x00
+
+EVENT_PM_MCS01_64B_WR_DISP_PORT23 0xF8 BBOX0
+UMASK_PM_MCS01_64B_WR_DISP_PORT23 0x00
+
+EVENT_PM_MCS23_64B_WR_DISP_PORT23 0x78 BBOX1
+UMASK_PM_MCS23_64B_WR_DISP_PORT23 0x00
diff --git a/src/includes/perfmon_sandybridgeEP_events.txt b/src/includes/perfmon_sandybridgeEP_events.txt
index 2e8caa292..4d94818c2 100644
--- a/src/includes/perfmon_sandybridgeEP_events.txt
+++ b/src/includes/perfmon_sandybridgeEP_events.txt
@@ -214,7 +214,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -502,11 +502,7 @@ UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
@@ -1336,19 +1332,19 @@ EVENT_RING_AD_USED                  0x07  PBOX
 UMASK_RING_AD_USED_CW_EVEN          0x01
 UMASK_RING_AD_USED_CW_ODD           0x02
 UMASK_RING_AD_USED_CCW_EVEN         0x04
-UMASK_RING_AD_USED_CCW_EVEN         0x08
+UMASK_RING_AD_USED_CCW_ODD          0x08
 
 EVENT_RING_AK_USED                  0x08  PBOX
 UMASK_RING_AK_USED_CW_EVEN          0x01
 UMASK_RING_AK_USED_CW_ODD           0x02
 UMASK_RING_AK_USED_CCW_EVEN         0x04
-UMASK_RING_AK_USED_CCW_EVEN         0x08
+UMASK_RING_AK_USED_CCW_ODD          0x08
 
 EVENT_RING_BL_USED                  0x09  PBOX
 UMASK_RING_BL_USED_CW_EVEN          0x01
 UMASK_RING_BL_USED_CW_ODD           0x02
 UMASK_RING_BL_USED_CCW_EVEN         0x04
-UMASK_RING_BL_USED_CCW_EVEN         0x08
+UMASK_RING_BL_USED_CCW_ODD          0x08
 
 EVENT_RING_IV_USED                  0x0A  PBOX
 UMASK_RING_IV_USED_ANY              0x0F
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index 2a4bf2d25..912488ead 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -214,7 +214,7 @@ UMASK_IDQ_DSB_UOPS                      0x08
 UMASK_IDQ_MS_DSB_UOPS                   0x10
 UMASK_IDQ_MS_MITE_UOPS                  0x20
 UMASK_IDQ_MS_UOPS                       0x30
-UMASK_IDQ_DSB_UOPS                      0x18
+UMASK_IDQ_DSB_ALL_UOPS                  0x18
 UMASK_IDQ_MITE_ALL_UOPS                 0x24
 UMASK_IDQ_ALL_UOPS                      0x3C
 DEFAULT_OPTIONS_IDQ_MITE_CYCLES         EVENT_OPTION_THRESHOLD=0x1
@@ -502,11 +502,7 @@ UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
 
 EVENT_UOPS_RETIRED                       0xC2  PMC
 UMASK_UOPS_RETIRED_ALL                   0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
diff --git a/src/includes/perfmon_skylake.h b/src/includes/perfmon_skylake.h
index e784bd496..267ded8f0 100644
--- a/src/includes/perfmon_skylake.h
+++ b/src/includes/perfmon_skylake.h
@@ -38,6 +38,7 @@
 #include <limits.h>
 #include <topology.h>
 #include <access.h>
+#include <linux/version.h>
 
 static int perfmon_numCountersSkylake = NUM_COUNTERS_SKYLAKE;
 static int perfmon_numCoreCountersSkylake = NUM_COUNTERS_CORE_SKYLAKE;
@@ -799,8 +800,13 @@ int perfmon_setupCounterThread_skylake(
                     if (flags & 0x1 == 0)
                     {
                         fprintf(stderr, "Warning: Counter PMC3 cannot be used if Restricted Transactional Memory feature is enabled and\n");
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0)
+                        fprintf(stderr, "         bit 0 of register TSX_FORCE_ABORT is 0. As workaround enable\n");
+                        fprintf(stderr, "         allow_tsx_force_abort in /sys/devices/cpu/\n");
+#else
                         fprintf(stderr, "         bit 0 of register TSX_FORCE_ABORT is 0. As workaround write 0x1 to TSX_FORCE_ABORT:\n");
                         fprintf(stderr, "         sudo wrmsr 0x10f 0x1\n");
+#endif
                         eventSet->events[i].type = NOTYPE;
                         continue;
                     }
diff --git a/src/includes/perfmon_skylakeX_events.txt b/src/includes/perfmon_skylakeX_events.txt
index 37703838c..458471775 100644
--- a/src/includes/perfmon_skylakeX_events.txt
+++ b/src/includes/perfmon_skylakeX_events.txt
@@ -146,7 +146,8 @@ UMASK_UOPS_ISSUED_CYCLES_GE_4_UOPS_EXEC 0x01
 DEFAULT_OPTIONS_UOPS_ISSUED_CYCLES_GE_5_UOPS_EXEC EVENT_OPTION_THRESHOLD=0x5
 UMASK_UOPS_ISSUED_CYCLES_GE_5_UOPS_EXEC 0x01
 
-
+EVENT_MEMORY_DISAMBIGUATION_HISTORY_RESET 0x09 PMC
+UMASK_MEMORY_DISAMBIGUATION_HISTORY_RESET 0x01
 
 EVENT_TX_EXEC                           0x5D PMC
 UMASK_TX_EXEC_MISC1                     0x01
@@ -197,16 +198,12 @@ UMASK_UOPS_RETIRED_ALL                   0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
@@ -293,7 +290,6 @@ UMASK_MEM_LOAD_L3_HIT_RETIRED_XSNP_ALL  0x0F
 
 EVENT_MEM_LOAD_L3_MISS_RETIRED              0xD3 PMC
 UMASK_MEM_LOAD_L3_MISS_RETIRED_LOCAL_DRAM   0x01
-UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_DRAM  0x02
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_HITM  0x04
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_FWD   0x08
 UMASK_MEM_LOAD_L3_MISS_RETIRED_REMOTE_ALL   0x0E
@@ -546,11 +542,6 @@ UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD 0x10
 DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 0x10
 
-EVENT_LOCK_CYCLES_CACHE_LOCK            0x63 PMC
-UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION   0x02
-DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
-UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT      0x02
-
 EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0xB2 PMC
 UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0x01
 
@@ -2655,7 +2646,6 @@ UMASK_DATA_REQ_BY_CPU_CFG_READ_PART2     0x40 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_CFG_READ_PART3     0x40 0x03 0x08
 UMASK_DATA_REQ_BY_CPU_CFG_READ_VTD0      0x40 0x03 0x10
 UMASK_DATA_REQ_BY_CPU_CFG_READ_VTD1      0x40 0x03 0x20
-UMASK_DATA_REQ_BY_CPU_CFG_READ_PART0     0x40 0x03 0x01
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART1    0x10 0x03 0x02
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART2    0x10 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_CFG_WRITE_PART3    0x10 0x03 0x08
@@ -2667,7 +2657,6 @@ UMASK_DATA_REQ_BY_CPU_IO_READ_PART2      0x80 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_IO_READ_PART3      0x80 0x03 0x08
 UMASK_DATA_REQ_BY_CPU_IO_READ_VTD0       0x80 0x03 0x10
 UMASK_DATA_REQ_BY_CPU_IO_READ_VTD1       0x80 0x03 0x20
-UMASK_DATA_REQ_BY_CPU_IO_READ_PART0      0x80 0x03 0x01
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART1     0x20 0x03 0x02
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART2     0x20 0x03 0x04
 UMASK_DATA_REQ_BY_CPU_IO_WRITE_PART3     0x20 0x03 0x08
@@ -2782,7 +2771,6 @@ UMASK_TXN_REQ_BY_CPU_CFG_READ_PART2     0x40 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_CFG_READ_PART3     0x40 0x03 0x08
 UMASK_TXN_REQ_BY_CPU_CFG_READ_VTD0      0x40 0x03 0x10
 UMASK_TXN_REQ_BY_CPU_CFG_READ_VTD1      0x40 0x03 0x20
-UMASK_TXN_REQ_BY_CPU_CFG_READ_PART0     0x40 0x03 0x01
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART1    0x10 0x03 0x02
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART2    0x10 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_CFG_WRITE_PART3    0x10 0x03 0x08
@@ -2794,7 +2782,6 @@ UMASK_TXN_REQ_BY_CPU_IO_READ_PART2      0x80 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_IO_READ_PART3      0x80 0x03 0x08
 UMASK_TXN_REQ_BY_CPU_IO_READ_VTD0       0x80 0x03 0x10
 UMASK_TXN_REQ_BY_CPU_IO_READ_VTD1       0x80 0x03 0x20
-UMASK_TXN_REQ_BY_CPU_IO_READ_PART0      0x80 0x03 0x01
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART1     0x20 0x03 0x02
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART2     0x20 0x03 0x04
 UMASK_TXN_REQ_BY_CPU_IO_WRITE_PART3     0x20 0x03 0x08
diff --git a/src/includes/perfmon_skylake_events.txt b/src/includes/perfmon_skylake_events.txt
index f7f58ac9c..64f8249df 100644
--- a/src/includes/perfmon_skylake_events.txt
+++ b/src/includes/perfmon_skylake_events.txt
@@ -197,16 +197,12 @@ UMASK_UOPS_RETIRED_ALL                   0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_ALL              0x01
 UMASK_UOPS_RETIRED_RETIRE_SLOTS          0x02
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1
 UMASK_UOPS_RETIRED_USED_CYCLES           0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_STALL_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_STALL_CYCLES          0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_TOTAL_CYCLES EVENT_OPTION_THRESHOLD=0xA,EVENT_OPTION_INVERT=1
 UMASK_UOPS_RETIRED_TOTAL_CYCLES          0x01
-DEFAULT_OPTIONS_UOPS_RETIRED_CORE_ALL EVENT_OPTION_ANYTHREAD=1
-UMASK_UOPS_RETIRED_CORE_ALL              0x01
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_RETIRE_SLOTS EVENT_OPTION_ANYTHREAD=1
 UMASK_UOPS_RETIRED_CORE_RETIRE_SLOTS     0x02
 DEFAULT_OPTIONS_UOPS_RETIRED_CORE_USED_CYCLES EVENT_OPTION_THRESHOLD=0x1,EVENT_OPTION_ANYTHREAD=1
@@ -522,11 +518,6 @@ UMASK_OFFCORE_REQUESTS_OUTSTANDING_CYCLES_WITH_L3_MISS_DEMAND_DATA_RD 0x10
 DEFAULT_OPTIONS_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 EVENT_OPTION_THRESHOLD=0x6
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_L3_MISS_DEMAND_DATA_RD_GE_6 0x10
 
-EVENT_LOCK_CYCLES_CACHE_LOCK            0x63 PMC
-UMASK_LOCK_CYCLES_CACHE_LOCK_DURATION   0x02
-DEFAULT_OPTIONS_LOCK_CYCLES_CACHE_LOCK_COUNT EVENT_OPTION_EDGE=0x1
-UMASK_LOCK_CYCLES_CACHE_LOCK_COUNT      0x02
-
 EVENT_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0xB2 PMC
 UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL   0x01
 
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index cb34ac342..b3a09ebb2 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -79,9 +79,15 @@ typedef enum {
     EVENT_OPTION_OCCUPANCY_INVERT, /*!< \brief Invert filter for occupancy counting */
     EVENT_OPTION_IN_TRANS, /*!< \brief Count events during transactions */
     EVENT_OPTION_IN_TRANS_ABORT, /*!< \brief Count events that aborted during transactions */
+    EVENT_OPTION_GENERIC_CONFIG, /*!< \brief Configuration bitmask for generic event */
+    EVENT_OPTION_GENERIC_UMASK, /*!< \brief Umask bitmask for generic event */
 #ifdef LIKWID_USE_PERFEVENT
     EVENT_OPTION_PERF_PID, /*!< \brief PID parameter to use in the perf_event_open call */
     EVENT_OPTION_PERF_FLAGS, /*!< \brief FLAGS parameters to use in the perf_event_open call */
+#endif
+#ifdef _ARCH_PPC
+    EVENT_OPTION_PMC,
+    EVENT_OPTION_PMCXSEL,
 #endif
     NUM_EVENT_OPTIONS /*!< \brief Amount of defined options */
 } EventOptionType;
@@ -135,6 +141,8 @@ extern char* eventOptionTypeName[NUM_EVENT_OPTIONS];
 #define EVENT_OPTION_OCCUPANCY_INVERT_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_INVERT)
 #define EVENT_OPTION_IN_TRANS_MASK (1ULL<<EVENT_OPTION_IN_TRANS)
 #define EVENT_OPTION_IN_TRANS_ABORT_MASK (1ULL<<EVENT_OPTION_IN_TRANS_ABORT)
+#define EVENT_OPTION_GENERIC_CONFIG_MASK (1ULL<<EVENT_OPTION_GENERIC_CONFIG)
+#define EVENT_OPTION_GENERIC_UMASK_MASK (1ULL<<EVENT_OPTION_GENERIC_UMASK)
 /** @endcond */
 
 /*! \brief Structure specifying thread to CPU relation
@@ -168,12 +176,12 @@ the event options are hold here.
 */
 typedef struct {
     const char*     name; /*!< \brief Name of the event */
-    const char*     limit; /*!< \brief Valid counters for the event */
-    uint16_t        eventId; /*!< \brief ID of the event */
-    uint8_t         umask; /*!< \brief Most events need to specify a mask to limit counting */
-    uint8_t         cfgBits; /*!< \brief Misc configuration bits */
+    char*           limit; /*!< \brief Valid counters for the event */
+    uint64_t        eventId; /*!< \brief ID of the event */
+    uint64_t        umask; /*!< \brief Most events need to specify a mask to limit counting */
+    uint64_t        cfgBits; /*!< \brief Misc configuration bits */
     uint64_t        cmask; /*!< \brief Misc mask bits */
-    uint8_t         numberOfOptions; /*!< \brief Number of options for the event */
+    uint64_t         numberOfOptions; /*!< \brief Number of options for the event */
     uint64_t        optionMask; /*!< \brief Bitmask for fast check of set options */
     PerfmonEventOption options[NUM_EVENT_OPTIONS]; /*!< \brief List of options */
 } PerfmonEvent;
diff --git a/src/includes/perfmon_zen.h b/src/includes/perfmon_zen.h
index 6ef3dca7a..898b00b30 100644
--- a/src/includes/perfmon_zen.h
+++ b/src/includes/perfmon_zen.h
@@ -41,6 +41,7 @@ int perfmon_init_zen(int cpu_id)
     lock_acquire((int*) &socket_lock[affinity_thread2socket_lookup[cpu_id]], cpu_id);
     lock_acquire((int*) &core_lock[affinity_thread2core_lookup[cpu_id]], cpu_id);
     lock_acquire((int*) &sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &numa_lock[affinity_thread2numa_lookup[cpu_id]], cpu_id);
     return 0;
 }
 
@@ -156,7 +157,7 @@ int k17_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
 
-    if (socket_lock[affinity_thread2socket_lookup[cpu_id]] != cpu_id)
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] != cpu_id)
     {
         return 0;
     }
@@ -164,7 +165,7 @@ int k17_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
     if (flags != currentConfig[cpu_id][index])
     {
-        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_MBOX0);
         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
         currentConfig[cpu_id][index] = flags;
     }
@@ -198,7 +199,7 @@ int perfmon_setupCounterThread_zen(int thread_id, PerfmonEventSet* eventSet)
             case FIXED:
                 fixed_flags |= k17_fixed_setup(cpu_id, index, event);
                 break;
-            case UNCORE:
+            case MBOX0:
                 k17_uncore_setup(cpu_id, index, event);
                 break;
             default:
@@ -224,6 +225,7 @@ int perfmon_startCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     int haveSLock = 0;
     int haveL3Lock = 0;
     int haveCLock = 0;
+    int haveMLock = 0;
     uint64_t flags = 0x0ULL;
     int cpu_id = groupSet->threads[thread_id].processorId;
 
@@ -231,6 +233,10 @@ int perfmon_startCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     {
         haveSLock = 1;
     }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
     if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
     {
         haveL3Lock = 1;
@@ -256,7 +262,7 @@ int perfmon_startCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
             eventSet->events[i].threadCounter[thread_id].startData = 0;
             eventSet->events[i].threadCounter[thread_id].counterData = 0;
             if ((type == PMC) ||
-                ((type == UNCORE) && (haveSLock)) ||
+                ((type == MBOX0) && (haveMLock)) ||
                 ((type == CBOX0) && (haveL3Lock)))
             {
                 VERBOSEPRINTREG(cpu_id, counter, LLU_CAST 0x0ULL, RESET_CTR);
@@ -295,6 +301,7 @@ int perfmon_stopCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     int haveSLock = 0;
     int haveL3Lock = 0;
     int haveCLock = 0;
+    int haveMLock = 0;
     uint64_t counter_result = 0x0ULL;
     int cpu_id = groupSet->threads[thread_id].processorId;
 
@@ -302,6 +309,10 @@ int perfmon_stopCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     {
         haveSLock = 1;
     }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
     if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
     {
         haveL3Lock = 1;
@@ -325,7 +336,7 @@ int perfmon_stopCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
             uint32_t reg = counter_map[index].configRegister;
             uint32_t counter = counter_map[index].counterRegister;
             if ((type == PMC) ||
-                ((type == UNCORE) && (haveSLock)) ||
+                ((type == MBOX0) && (haveMLock)) ||
                 ((type == CBOX0) && (haveL3Lock)))
             {
                 CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
@@ -379,6 +390,7 @@ int perfmon_readCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     int haveSLock = 0;
     int haveL3Lock = 0;
     int haveCLock = 0;
+    int haveMLock = 0;
     uint64_t counter_result = 0x0ULL;
     int cpu_id = groupSet->threads[thread_id].processorId;
 
@@ -386,6 +398,10 @@ int perfmon_readCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
     {
         haveSLock = 1;
     }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
     if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
     {
         haveL3Lock = 1;
@@ -409,7 +425,7 @@ int perfmon_readCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
             uint32_t counter = counter_map[index].counterRegister;
             uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
             if ((type == PMC) ||
-                ((type == UNCORE) && (haveSLock)) ||
+                ((type == MBOX0) && (haveMLock)) ||
                 ((type == CBOX0) && (haveL3Lock)))
             {
                 CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
@@ -456,12 +472,17 @@ int perfmon_finalizeCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveSLock = 0;
     int haveL3Lock = 0;
+    int haveMLock = 0;
     int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id)
     {
         haveSLock = 1;
     }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
     if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
     {
         haveL3Lock = 1;
@@ -476,7 +497,7 @@ int perfmon_finalizeCountersThread_zen(int thread_id, PerfmonEventSet* eventSet)
         }
         RegisterIndex index = eventSet->events[i].index;
         if ((type == PMC) ||
-            ((type == UNCORE) && (haveSLock)) ||
+            ((type == MBOX0) && (haveMLock)) ||
             ((type == CBOX0) && (haveL3Lock)))
         {
             if (counter_map[index].configRegister != 0x0)
diff --git a/src/includes/perfmon_zen2.h b/src/includes/perfmon_zen2.h
new file mode 100644
index 000000000..02df4ddb2
--- /dev/null
+++ b/src/includes/perfmon_zen2.h
@@ -0,0 +1,534 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_zen2.h
+ *
+ *      Description:  Header file of perfmon module for AMD Family 17 (ZEN2)
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Gruber (tg), thomas.roehl@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2017 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_zen2_events.h>
+#include <perfmon_zen2_counters.h>
+#include <error.h>
+#include <affinity.h>
+
+static int perfmon_numCountersZen2 = NUM_COUNTERS_ZEN2; /* size of zen2_counter_map */
+static int perfmon_numArchEventsZen2 = NUM_ARCH_EVENTS_ZEN2; /* NOTE(review): NUM_ARCH_EVENTS_ZEN2 presumably comes from the generated perfmon_zen2_events.h -- confirm */
+/* Try to acquire the socket, core, shared-L3 and NUMA locks of cpu_id's domains on behalf of cpu_id; always returns 0. */
+int perfmon_init_zen2(int cpu_id)
+{
+    lock_acquire((int*) &socket_lock[affinity_thread2socket_lookup[cpu_id]], cpu_id); /* NOTE(review): presumably a no-op if another CPU already owns the lock -- confirm in lock_acquire */
+    lock_acquire((int*) &core_lock[affinity_thread2core_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &numa_lock[affinity_thread2numa_lookup[cpu_id]], cpu_id);
+    return 0; /* ownership is checked later by comparing the lock value against cpu_id */
+}
+
+/* Return the MSR_AMD17_HW_CONFIG bits that `event` needs (0 if none); no register is written here. */
+int zen2_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
+    (void) cpu_id; (void) index; /* mark as used without modifying them: cpu_id is printed below */
+    switch (event->eventId)
+    {
+        case 0x1: /* retired instructions (FIXC0) must be enabled explicitly */
+            flags |= (1ULL << AMD_K17_INST_RETIRE_ENABLE_BIT);
+            VERBOSEPRINTREG(cpu_id, 0x00, LLU_CAST flags, SETUP_FIXC0);
+            break;
+        case 0x2: /* FIXC1 */
+        case 0x3: /* FIXC2: nothing to configure */
+            break;
+        default:
+            fprintf(stderr, "Unknown fixed event 0x%X\n", event->eventId);
+            break;
+    }
+    return flags;
+}
+
+/* Build the event select value for core counter `index` and write it on cpu_id if it changed. */
+int zen2_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    /* User-space counting is always enabled; kernel counting is opt-in (see below). */
+    uint64_t flags = (1ULL<<AMD_K17_PMC_USER_BIT);
+
+    /* The event ID is split: its low part and the bits from bit 8 upwards use separate fields. */
+    flags |= ((event->eventId & AMD_K17_PMC_EVSEL_MASK) << AMD_K17_PMC_EVSEL_SHIFT);
+    flags |= (((event->eventId >> 8) & AMD_K17_PMC_EVSEL_MASK2) << AMD_K17_PMC_EVSEL_SHIFT2);
+    flags |= ((event->umask & AMD_K17_PMC_UNIT_MASK) << AMD_K17_PMC_UNIT_SHIFT);
+    /* Apply the supported event options; any other option type is ignored. */
+    for (int opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[opt].value & AMD_K17_PMC_THRES_MASK) << AMD_K17_PMC_THRES_SHIFT;
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<AMD_K17_PMC_INVERT_BIT);
+                break;
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<AMD_K17_PMC_KERNEL_BIT);
+                break;
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<AMD_K17_PMC_EDGE_BIT);
+                break;
+            default:
+                break;
+        }
+    }
+    /* currentConfig caches the last written value, so an unchanged setup costs no MSR write. */
+    if (flags == currentConfig[cpu_id][index])
+    {
+        return 0;
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    currentConfig[cpu_id][index] = flags;
+    return 0;
+}
+
+/* Build the event select value for L3 counter `index`; only the shared-L3 lock owner writes it. */
+int zen2_cache_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
+    int tid_given = 0;
+    int slice_given = 0;
+
+    if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags |= ((event->eventId & AMD_K17_L3_EVSEL_MASK) << AMD_K17_L3_EVSEL_SHIFT);
+    flags |= ((event->umask & AMD_K17_L3_UNIT_MASK) << AMD_K17_L3_UNIT_SHIFT);
+    /* Optional thread (TID) and slice (MATCH0) filters; other option types are ignored. */
+    for (int opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        switch (event->options[opt].type)
+        {
+            case EVENT_OPTION_MATCH0:
+                flags |= ((uint64_t)(event->options[opt].value & AMD_K17_L3_SLICE_MASK)) << AMD_K17_L3_SLICE_SHIFT;
+                slice_given = 1;
+                break;
+            case EVENT_OPTION_TID:
+                flags |= ((uint64_t)(event->options[opt].value & AMD_K17_L3_TID_MASK)) << AMD_K17_L3_TID_SHIFT;
+                tid_given = 1;
+                break;
+            default:
+                break;
+        }
+    }
+    /* A filter that was not given defaults to its full mask, i.e. all bits of the field set. */
+    if (!tid_given)
+        flags |= AMD_K17_L3_TID_MASK << AMD_K17_L3_TID_SHIFT;
+    if (!slice_given)
+        flags |= AMD_K17_L3_SLICE_MASK << AMD_K17_L3_SLICE_SHIFT;
+    if (flags == currentConfig[cpu_id][index])
+    {
+        return 0;
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    currentConfig[cpu_id][index] = flags;
+    return 0;
+}
+
+/* Build the event select value for data fabric counter `index`; only the NUMA lock owner writes it. */
+int zen2_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
+
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    /* The event ID is scattered over three register fields: ID bits 0-7, 8-11 and 12-13. */
+    flags |= ((event->eventId & AMD_K17_DF_EVSEL_MASK) << AMD_K17_DF_EVSEL_SHIFT);
+    flags |= (((event->eventId >> 8) & AMD_K17_DF_EVSEL_MASK1) << AMD_K17_DF_EVSEL_SHIFT1);
+    flags |= (((event->eventId >> 12) & AMD_K17_DF_EVSEL_MASK2) << AMD_K17_DF_EVSEL_SHIFT2);
+    flags |= ((event->umask & AMD_K17_DF_UNIT_MASK) << AMD_K17_DF_UNIT_SHIFT);
+    if (flags == currentConfig[cpu_id][index])
+    {
+        return 0;
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_DF);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    currentConfig[cpu_id][index] = flags;
+    return 0;
+}
+
+/* Configure all events of eventSet on the CPU of thread_id and mark them initialised; returns 0 on the normal path. */
+int perfmon_setupCounterThread_zen2(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    for (int evt = 0; evt < eventSet->numberOfEvents; evt++)
+    {
+        RegisterType type = eventSet->events[evt].type;
+        if (!TESTTYPE(eventSet, type))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[evt].index;
+        PerfmonEvent *event = &(eventSet->events[evt].event);
+        switch (type)
+        {
+            case FIXED: /* only collects HW_CONFIG bits; they are written once after the loop */
+                fixed_flags |= zen2_fixed_setup(cpu_id, index, event);
+                break;
+            case PMC: /* the return values of the per-unit setup helpers are not evaluated */
+                zen2_pmc_setup(cpu_id, index, event);
+                break;
+            case CBOX0:
+                zen2_cache_setup(cpu_id, index, event);
+                break;
+            case MBOX0:
+                zen2_uncore_setup(cpu_id, index, event);
+                break;
+            case POWER: /* energy counters need no configuration */
+            default:
+                break;
+        }
+        eventSet->events[evt].threadCounter[thread_id].init = TRUE;
+    }
+    if (fixed_flags != 0x0ULL)
+    {
+        uint64_t tmp = 0x0ULL;
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_AMD17_HW_CONFIG, &tmp));
+        VERBOSEPRINTREG(cpu_id, MSR_AMD17_HW_CONFIG, LLU_CAST tmp, READ_HW_CONFIG);
+        tmp |= fixed_flags; /* read-modify-write keeps the other HW_CONFIG bits as they were */
+        VERBOSEPRINTREG(cpu_id, MSR_AMD17_HW_CONFIG, LLU_CAST tmp, WRITE_HW_CONFIG);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_AMD17_HW_CONFIG, tmp));
+    }
+    return 0;
+}
+
+
+/*
+ * Start all initialised events of eventSet on the CPU of thread_id. Core, L3 and data
+ * fabric counters are zeroed and enabled; energy and fixed counters are not modified,
+ * only their current value is recorded as start value. Returns 0 on the normal path
+ * (NOTE(review): the CHECK_* macros presumably return early on access errors -- confirm).
+ */
+int perfmon_startCountersThread_zen2(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* Shared units are only handled by the CPU whose ID is stored in the matching lock. */
+    int haveSLock = (socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id);
+    int haveL3Lock = (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id);
+    int haveCLock = (core_lock[affinity_thread2core_lookup[cpu_id]] == cpu_id);
+    int haveMLock = (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id);
+
+    for (int evt = 0; evt < eventSet->numberOfEvents; evt++)
+    {
+        /* Events that were not set up for this thread are skipped. */
+        if (eventSet->events[evt].threadCounter[thread_id].init != TRUE)
+        {
+            continue;
+        }
+        RegisterType type = eventSet->events[evt].type;
+        if (!TESTTYPE(eventSet, type))
+        {
+            continue;
+        }
+
+        RegisterIndex index = eventSet->events[evt].index;
+        uint32_t reg = counter_map[index].configRegister;
+        uint32_t counter = counter_map[index].counterRegister;
+
+        flags = 0x0ULL;
+        eventSet->events[evt].threadCounter[thread_id].startData = 0;
+        eventSet->events[evt].threadCounter[thread_id].counterData = 0;
+        if ((type == PMC) ||
+            ((type == MBOX0) && (haveMLock)) ||
+            ((type == CBOX0) && (haveL3Lock)))
+        {
+            /* Zero the counter first, then set the enable bit in its config register. */
+            VERBOSEPRINTREG(cpu_id, counter, LLU_CAST 0x0ULL, RESET_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, READ_CTRL);
+            flags |= (1ULL << AMD_K17_ENABLE_BIT);
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, START_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+        }
+        else if (type == POWER)
+        {
+            /* Package energy is read by the socket lock owner, core energy by the core lock owner. */
+            if (counter == MSR_AMD17_RAPL_PKG_STATUS && (!haveSLock))
+                continue;
+            if (counter == MSR_AMD17_RAPL_CORE_STATUS && (!haveCLock))
+                continue;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &flags));
+            eventSet->events[evt].threadCounter[thread_id].startData = field64(flags, 0, box_map[type].regWidth);
+            VERBOSEPRINTREG(cpu_id, counter, LLU_CAST field64(flags, 0, box_map[type].regWidth), START_POWER);
+        }
+        else if (type == FIXED)
+        {
+            /* Fixed counters are not reset; their current value becomes the start value. */
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &flags));
+            eventSet->events[evt].threadCounter[thread_id].startData = field64(flags, 0, box_map[type].regWidth);
+            VERBOSEPRINTREG(cpu_id, counter, LLU_CAST field64(flags, 0, box_map[type].regWidth), START_FIXED);
+        }
+        /* Events of shared units owned by another CPU match no branch above and keep
+         * the start value 0 assigned before the branches. */
+        eventSet->events[evt].threadCounter[thread_id].counterData = eventSet->events[evt].threadCounter[thread_id].startData;
+    }
+
+    return 0;
+}
+
+/* Stop all initialised events of eventSet on the CPU of thread_id and store their final values in counterData. */
+int perfmon_stopCountersThread_zen2(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int haveSLock = 0;
+    int haveL3Lock = 0;
+    int haveCLock = 0;
+    int haveMLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId; /* shared units below are only handled by their lock owner */
+    if (socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
+    {
+        haveL3Lock = 1;
+    }
+    if (core_lock[affinity_thread2core_lookup[cpu_id]] == cpu_id)
+    {
+        haveCLock = 1;
+    }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!TESTTYPE(eventSet, type))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            if ((type == PMC) ||
+                ((type == MBOX0) && (haveMLock)) ||
+                ((type == CBOX0) && (haveL3Lock)))
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags &= ~(1ULL<<AMD_K17_ENABLE_BIT);  /* clear enable flag */
+                VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, STOP_CTRL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_CTR); /* log the register that was actually read */
+                counter_result = field64(counter_result, 0, box_map[type].regWidth); /* keep only regWidth bits, as the read path does */
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, OVERFLOW);
+                }
+            }
+            else if (type == POWER)
+            {
+                if (counter == MSR_AMD17_RAPL_PKG_STATUS && (!haveSLock))
+                    continue;
+                if (counter == MSR_AMD17_RAPL_CORE_STATUS && (!haveCLock))
+                    continue;
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, OVERFLOW_POWER)
+                }
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, STOP_POWER);
+            }
+            else if (type == FIXED)
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, OVERFLOW_FIXED)
+                }
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, STOP_FIXED);
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = counter_result; /* stays 0 for shared-unit events this CPU does not own */
+        }
+    }
+    return 0;
+}
+
+/* Read all initialised events of eventSet on the CPU of thread_id into counterData without stopping them. */
+int perfmon_readCountersThread_zen2(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveSLock = 0;
+    int haveL3Lock = 0;
+    int haveCLock = 0;
+    int haveMLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_thread2socket_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
+    {
+        haveL3Lock = 1;
+    }
+    if (core_lock[affinity_thread2core_lookup[cpu_id]] == cpu_id)
+    {
+        haveCLock = 1;
+    }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!TESTTYPE(eventSet, type))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            if ((type == PMC) ||
+                ((type == MBOX0) && (haveMLock)) ||
+                ((type == CBOX0) && (haveL3Lock)))
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_CTR);
+                if (field64(counter_result, 0, box_map[type].regWidth) < eventSet->events[i].threadCounter[thread_id].counterData) /* compare like with like: counterData holds masked values */
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                }
+                *current = field64(counter_result, 0, box_map[type].regWidth);
+            }
+            else if (type == POWER)
+            {
+                if (counter == MSR_AMD17_RAPL_PKG_STATUS && (!haveSLock))
+                    continue;
+                if (counter == MSR_AMD17_RAPL_CORE_STATUS && (!haveCLock))
+                    continue;
+                CHECK_POWER_READ_ERROR(power_read(cpu_id, counter, (uint32_t*)&counter_result)); /* fills 32 bits of the value zeroed above; NOTE(review): relies on little-endian layout */
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_POWER)
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, OVERFLOW_POWER)
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                }
+                *current = field64(counter_result, 0, box_map[type].regWidth);
+            }
+            else if (type == FIXED)
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, OVERFLOW_FIXED)
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                }
+                *current = field64(counter_result, 0, box_map[type].regWidth);
+            }
+        }
+    }
+    return 0;
+}
+
+
+/*
+ * Undo the setup of eventSet on the CPU of thread_id: config and counter registers of
+ * core, L3 and data fabric counters are zeroed, and for fixed events the
+ * instructions-retired enable bit is cleared in MSR_AMD17_HW_CONFIG. Returns 0 on the normal path.
+ */
+int perfmon_finalizeCountersThread_zen2(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveMLock = 0;
+    int haveL3Lock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (sharedl3_lock[affinity_thread2sharedl3_lookup[cpu_id]] == cpu_id)
+    {
+        haveL3Lock = 1;
+    }
+    if (numa_lock[affinity_thread2numa_lookup[cpu_id]] == cpu_id)
+    {
+        haveMLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!TESTTYPE(eventSet, type))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        if ((type == PMC) ||
+            ((type == MBOX0) && (haveMLock)) ||
+            ((type == CBOX0) && (haveL3Lock)))
+        {
+            if (counter_map[index].configRegister != 0x0)
+            {
+                VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, 0x0ULL, CLEAR_CTRL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+            }
+            if (counter_map[index].counterRegister != 0x0)
+            {
+                VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister, 0x0ULL));
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE;
+        }
+        else if (type == FIXED)
+        {
+            uint64_t tmp = 0x0ULL;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_AMD17_HW_CONFIG, &tmp));
+            /* Write HW_CONFIG back only when the enable bit is set and has to be cleared. */
+            if (tmp & (1ULL << AMD_K17_INST_RETIRE_ENABLE_BIT))
+            {
+                tmp &= ~(1ULL << AMD_K17_INST_RETIRE_ENABLE_BIT);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_AMD17_HW_CONFIG, tmp));
+            }
+        }
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_zen2_counters.h b/src/includes/perfmon_zen2_counters.h
new file mode 100644
index 000000000..d8b68e7cc
--- /dev/null
+++ b/src/includes/perfmon_zen2_counters.h
@@ -0,0 +1,89 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_zen2_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for AMD Family 17 (Zen2)
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2017 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_ZEN2 21 /* number of rows in zen2_counter_map */
+#define NUM_COUNTERS_CORE_ZEN2 9 /* NOTE(review): presumably the 3 fixed + 6 core PMC rows at the start of the map -- confirm */
+/* Field layout of a data fabric event select value: event ID bits 0-7, 8-11, 12-13 and the unit mask */
+#define AMD_K17_DF_EVSEL_SHIFT  0
+#define AMD_K17_DF_EVSEL_MASK   0xFFULL
+#define AMD_K17_DF_EVSEL_SHIFT1 32
+#define AMD_K17_DF_EVSEL_MASK1  0xFULL
+#define AMD_K17_DF_EVSEL_SHIFT2 59
+#define AMD_K17_DF_EVSEL_MASK2  0x3ULL
+#define AMD_K17_DF_UNIT_SHIFT   8
+#define AMD_K17_DF_UNIT_MASK    0xFFULL
+/* Event options listed for core (PMC) and L3 (CPMC) counters; parenthesized so they expand safely inside larger expressions */
+#define ZEN2_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define ZEN2_VALID_OPTIONS_L3 (EVENT_OPTION_TID_MASK|EVENT_OPTION_MATCH0_MASK)
+
+static RegisterMap zen2_counter_map[NUM_COUNTERS_ZEN2] = { /* one row per counter; NOTE(review): last field is presumably the mask of accepted event options -- confirm against RegisterMap */
+    /* Fixed-purpose counters; only FIXC0 names a config register (MSR_AMD17_HW_CONFIG) */
+    {"FIXC0", PMC0, FIXED, MSR_AMD17_HW_CONFIG, MSR_AMD17_RO_INST_RETIRED_CTR, 0, 0, 0},
+    {"FIXC1", PMC1, FIXED, 0, MSR_AMD17_RO_APERF, 0, 0, 0},
+    {"FIXC2", PMC2, FIXED, 0, MSR_AMD17_RO_MPERF, 0, 0, 0},
+    /* General-purpose core counters */
+    {"PMC0", PMC3, PMC, MSR_AMD17_PERFEVTSEL0, MSR_AMD17_PMC0, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_AMD17_PERFEVTSEL1, MSR_AMD17_PMC1, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_AMD17_PERFEVTSEL2, MSR_AMD17_PMC2, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_AMD17_PERFEVTSEL3, MSR_AMD17_PMC3, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    {"PMC4", PMC7, PMC, MSR_AMD17_2_PERFEVTSEL4, MSR_AMD17_2_PMC4, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    {"PMC5", PMC8, PMC, MSR_AMD17_2_PERFEVTSEL5, MSR_AMD17_2_PMC5, 0, 0, ZEN2_VALID_OPTIONS_PMC},
+    /* L3 cache counters (handled by the shared-L3 lock owner) */
+    {"CPMC0", PMC9, CBOX0, MSR_AMD17_L3_PERFEVTSEL0, MSR_AMD17_L3_PMC0, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    {"CPMC1", PMC10, CBOX0, MSR_AMD17_L3_PERFEVTSEL1, MSR_AMD17_L3_PMC1, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    {"CPMC2", PMC11, CBOX0, MSR_AMD17_L3_PERFEVTSEL2, MSR_AMD17_L3_PMC2, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    {"CPMC3", PMC12, CBOX0, MSR_AMD17_L3_PERFEVTSEL3, MSR_AMD17_L3_PMC3, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    {"CPMC4", PMC13, CBOX0, MSR_AMD17_L3_PERFEVTSEL4, MSR_AMD17_L3_PMC4, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    {"CPMC5", PMC14, CBOX0, MSR_AMD17_L3_PERFEVTSEL5, MSR_AMD17_L3_PMC5, 0, 0, ZEN2_VALID_OPTIONS_L3},
+    /* Energy counters: no config register, no event options */
+    {"PWR0", PMC15, POWER, 0, MSR_AMD17_RAPL_CORE_STATUS, 0, 0, 0},
+    {"PWR1", PMC16, POWER, 0, MSR_AMD17_RAPL_PKG_STATUS, 0, 0, 0},
+    /* Data fabric counters (handled by the NUMA lock owner), no event options */
+    {"DFC0", PMC17, MBOX0, MSR_AMD17_2_DF_PERFEVTSEL0, MSR_AMD17_2_DF_PMC0, 0, 0, 0},
+    {"DFC1", PMC18, MBOX0, MSR_AMD17_2_DF_PERFEVTSEL1, MSR_AMD17_2_DF_PMC1, 0, 0, 0},
+    {"DFC2", PMC19, MBOX0, MSR_AMD17_2_DF_PERFEVTSEL2, MSR_AMD17_2_DF_PMC2, 0, 0, 0},
+    {"DFC3", PMC20, MBOX0, MSR_AMD17_2_DF_PERFEVTSEL3, MSR_AMD17_2_DF_PMC3, 0, 0, 0},
+};
+
+static BoxMap zen2_box_map[NUM_UNITS] = { /* per unit type; NOTE(review): the last initializer is presumably .regWidth (counter width in bits), the only field the zen2 code reads -- confirm against BoxMap */
+    [FIXED] = {0, 0, 0, 0, 0, 0, 64},
+    [PMC] = {0, 0, 0, 0, 0, 0, 48},
+    [CBOX0] = {0, 0, 0, 0, 0, 0, 48}, /* L3 cache counters */
+    [MBOX0] = {0, 0, 0, 0, 0, 0, 48}, /* data fabric counters */
+    [POWER] = {0, 0, 0, 0, 0, 0, 32}, /* energy values are read as uint32_t via power_read() */
+};
+
+static char* zen2_translate_types[NUM_UNITS] = { /* unit type -> sysfs event source directory; NOTE(review): presumably consumed by the perf_event backend -- confirm */
+    [FIXED] = "/sys/bus/event_source/devices/cpu",
+    [PMC] = "/sys/bus/event_source/devices/cpu",
+    [POWER] = "/sys/bus/event_source/devices/power",
+    [CBOX0] = "/sys/bus/event_source/devices/amd_l3", /* L3 cache counters */
+    [MBOX0] = "/sys/bus/event_source/devices/amd_df", /* data fabric counters */
+};
diff --git a/src/includes/perfmon_zen2_events.txt b/src/includes/perfmon_zen2_events.txt
new file mode 100644
index 000000000..ea6751aa7
--- /dev/null
+++ b/src/includes/perfmon_zen2_events.txt
@@ -0,0 +1,431 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_zen2_events.txt
+#
+#      Description:  Event list for AMD Zen (Gen2)
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl@googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2017 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+# Fixed Events
+EVENT_INST_RETIRED_ANY                      0x01 FIXC0
+UMASK_INST_RETIRED_ANY                      0x00
+
+EVENT_ACTUAL_CPU_CLOCK                      0x02 FIXC1
+UMASK_ACTUAL_CPU_CLOCK                      0x00
+
+EVENT_APERF                                 0x02 FIXC1
+UMASK_APERF                                 0x00
+
+EVENT_MAX_CPU_CLOCK                         0x03 FIXC2
+UMASK_MAX_CPU_CLOCK                         0x00
+
+EVENT_MPERF                                 0x03 FIXC2
+UMASK_MPERF                                 0x00
+
+# Core-local Events
+
+# FP events
+
+EVENT_FPU_PIPE_ASSIGNMENT                              0x00    PMC
+UMASK_FPU_PIPE_ASSIGNMENT_UOPS_PIPE_0                  0x01
+UMASK_FPU_PIPE_ASSIGNMENT_UOPS_PIPE_1                  0x02
+UMASK_FPU_PIPE_ASSIGNMENT_UOPS_PIPE_2                  0x04
+UMASK_FPU_PIPE_ASSIGNMENT_UOPS_PIPE_3                  0x08
+UMASK_FPU_PIPE_ASSIGNMENT_UOPS_PIPE_ALL                0x0F
+UMASK_FPU_PIPE_ASSIGNMENT_MULTI_PIPE_UOPS_PIPE_0       0x10
+UMASK_FPU_PIPE_ASSIGNMENT_MULTI_PIPE_UOPS_PIPE_1       0x20
+UMASK_FPU_PIPE_ASSIGNMENT_MULTI_PIPE_UOPS_PIPE_2       0x40
+UMASK_FPU_PIPE_ASSIGNMENT_MULTI_PIPE_UOPS_PIPE_3       0x80
+UMASK_FPU_PIPE_ASSIGNMENT_MULTI_PIPE_UOPS_PIPE_ALL     0xF0
+
+
+EVENT_FP_SCHEDULER_EMPTY                          0x01    PMC
+UMASK_FP_SCHEDULER_EMPTY                          0x00
+
+EVENT_RETIRED_X87_FLOPS                           0x02     PMC
+UMASK_RETIRED_X87_FLOPS_ADD_SUB                   0x01
+UMASK_RETIRED_X87_FLOPS_MULT                      0x02
+UMASK_RETIRED_X87_FLOPS_DIV                       0x04
+UMASK_RETIRED_X87_FLOPS_ALL                       0x07
+
+EVENT_RETIRED_SSE_AVX_FLOPS                     0x03    PMC
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_SUB      0x01
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_MULT         0x02
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_DIV          0x04
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_ADD_MULT_DIV 0x07
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_FMA          0x08
+UMASK_RETIRED_SSE_AVX_FLOPS_SINGLE_ALL          0x0F
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_SUB      0x10
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_MULT         0x20
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_DIV          0x40
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_ADD_MULT_DIV 0x70
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_FMA          0x80
+UMASK_RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL          0xF0
+UMASK_RETIRED_SSE_AVX_FLOPS_ALL                 0xFF
+
+EVENT_MOVE_ELIMINATION                         0x04 PMC
+UMASK_MOVE_ELIMINATION_SSE_MOVES               0x01
+UMASK_MOVE_ELIMINATION_SSE_MOVE_ELIMS          0x02
+
+EVENT_SCALAR_OP_OPTIMIZATIONS                    0x04 PMC
+UMASK_SCALAR_OP_OPTIMIZATIONS_POTENTIAL          0x04
+UMASK_SCALAR_OP_OPTIMIZATIONS_DONE               0x08
+
+EVENT_RETIRED_SERIALIZING_OPS                       0x05    PMC
+UMASK_RETIRED_SERIALIZING_OPS_SSE_BOTTOM            0x00
+UMASK_RETIRED_SERIALIZING_OPS_SSE_CONTROL           0x01
+UMASK_RETIRED_SERIALIZING_OPS_X87_BOTTOM            0x02
+UMASK_RETIRED_SERIALIZING_OPS_X87_CONTROL           0x04
+
+# LS Events
+
+EVENT_RETIRED_LOCK_INSTR                0x25 PMC
+UMASK_RETIRED_LOCK_INSTR                0x00
+
+EVENT_RETIRED_CLFLUSH                   0x26 PMC
+UMASK_RETIRED_CLFLUSH                   0x00
+
+EVENT_RETIRED_CPUID                     0x27 PMC
+UMASK_RETIRED_CPUID                     0x00
+
+EVENT_LS_DISPATCH                      0x29 PMC
+UMASK_LS_DISPATCH_LOADS                0x01
+UMASK_LS_DISPATCH_STORES               0x02
+UMASK_LS_DISPATCH_LOAD_OP_STORES       0x04
+
+EVENT_SMIS_RECEIVED                    0x2B PMC
+UMASK_SMIS_RECEIVED                    0x00
+
+EVENT_INTERRUPTS_TAKEN                 0x2C PMC
+UMASK_INTERRUPTS_TAKEN                 0x00
+
+EVENT_ST_TO_LD_FWD                     0x35 PMC
+UMASK_ST_TO_LD_FWD                     0x00
+
+EVENT_ST_COMMIT_CANCELS                0x37 PMC
+UMASK_ST_COMMIT_CANCELS                0x01
+
+EVENT_DATA_CACHE_ACCESSES              0x40 PMC
+UMASK_DATA_CACHE_ACCESSES              0x00
+
+
+EVENT_DC_MISS_BY_TYPE                  0x41 PMC
+UMASK_DC_MISS_BY_TYPE_LOADS            0x01
+UMASK_DC_MISS_BY_TYPE_STORES           0x02
+UMASK_DC_MISS_BY_TYPE_HW_PF            0x08
+
+EVENT_DATA_CACHE_REFILLS                            0x43 PMC
+UMASK_DATA_CACHE_REFILLS_LOCAL_L2                   0x01
+UMASK_DATA_CACHE_REFILLS_LOCAL_CACHE                0x02
+UMASK_DATA_CACHE_REFILLS_LOCAL_DRAM                 0x08
+UMASK_DATA_CACHE_REFILLS_REMOTE_CACHE               0x10
+UMASK_DATA_CACHE_REFILLS_REMOTE_DRAM                0x40
+UMASK_DATA_CACHE_REFILLS_LOCAL                      0x0B
+UMASK_DATA_CACHE_REFILLS_REMOTE                     0x50
+UMASK_DATA_CACHE_REFILLS_ALL                        0x5B
+
+EVENT_L1_DTLB_MISS                     0x45 PMC
+UMASK_L1_DTLB_MISS_4K_L2_HIT           0x01
+UMASK_L1_DTLB_MISS_32K_L2_HIT          0x02
+UMASK_L1_DTLB_MISS_2M_L2_HIT           0x04
+UMASK_L1_DTLB_MISS_1G_L2_HIT           0x08
+UMASK_L1_DTLB_MISS_ANY_L2_HIT          0x0F
+UMASK_L1_DTLB_MISS_4K_L2_MISS          0x10
+UMASK_L1_DTLB_MISS_32K_L2_MISS         0x20
+UMASK_L1_DTLB_MISS_2M_L2_MISS          0x40
+UMASK_L1_DTLB_MISS_1G_L2_MISS          0x80
+UMASK_L1_DTLB_MISS_ANY_L2_MISS         0xF0
+
+EVENT_TABLEWALKER_ALLOC                0x46 PMC
+UMASK_TABLEWALKER_ALLOC_DSIDE0         0x01
+UMASK_TABLEWALKER_ALLOC_DSIDE1         0x02
+UMASK_TABLEWALKER_ALLOC_ISIDE0         0x04
+UMASK_TABLEWALKER_ALLOC_ISIDE1         0x08
+
+EVENT_MISALIGNED_LOADS                      0x47    PMC
+UMASK_MISALIGNED_LOADS                      0x00
+
+EVENT_PREF_INSTR_DISPATCHED                 0x4B    PMC
+UMASK_PREF_INSTR_DISPATCHED                 0x00
+
+EVENT_INEFFECTIVE_SW_PREF                    0x52    PMC
+UMASK_INEFFECTIVE_SW_PREF_DATA_CACHE_HIT     0x01
+UMASK_INEFFECTIVE_SW_PREF_MAB_MATCH          0x02
+
+EVENT_SWPREF_DATA_CACHE_FILLS                   0x59 PMC
+UMASK_SWPREF_DATA_CACHE_FILLS_LOCAL_L2          0x01
+UMASK_SWPREF_DATA_CACHE_FILLS_LOCAL_CACHE       0x02
+UMASK_SWPREF_DATA_CACHE_FILLS_LOCAL_DRAM        0x08
+UMASK_SWPREF_DATA_CACHE_FILLS_REMOTE_CACHE      0x10
+UMASK_SWPREF_DATA_CACHE_FILLS_REMOTE_DRAM       0x40
+UMASK_SWPREF_DATA_CACHE_FILLS_LOCAL                0x0B
+UMASK_SWPREF_DATA_CACHE_FILLS_REMOTE               0x50
+UMASK_SWPREF_DATA_CACHE_FILLS_ALL                  0x5B
+
+EVENT_HWPREF_DATA_CACHE_FILLS                   0x5A PMC
+UMASK_HWPREF_DATA_CACHE_FILLS_LOCAL_L2          0x01
+UMASK_HWPREF_DATA_CACHE_FILLS_LOCAL_CACHE       0x02
+UMASK_HWPREF_DATA_CACHE_FILLS_LOCAL_DRAM        0x08
+UMASK_HWPREF_DATA_CACHE_FILLS_REMOTE_CACHE      0x10
+UMASK_HWPREF_DATA_CACHE_FILLS_REMOTE_DRAM       0x40
+UMASK_HWPREF_DATA_CACHE_FILLS_LOCAL                0x0B
+UMASK_HWPREF_DATA_CACHE_FILLS_REMOTE               0x50
+UMASK_HWPREF_DATA_CACHE_FILLS_ALL                  0x5B
+
+EVENT_TABLEWALKER_DATA_CACHE_FILLS                   0x5B PMC
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_LOCAL_L2          0x01
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_LOCAL_CACHE       0x02
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_LOCAL_DRAM        0x08
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_REMOTE_CACHE      0x10
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_REMOTE_DRAM       0x40
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_LOCAL             0x0B
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_REMOTE            0x50
+UMASK_TABLEWALKER_DATA_CACHE_FILLS_ALL               0x5B
+
+EVENT_CPU_CLOCKS_UNHALTED                    0x76 PMC
+UMASK_CPU_CLOCKS_UNHALTED                    0x00
+
+EVENT_TLB_FLUSHES                            0x78 PMC
+UMASK_TLB_FLUSHES                            0x00
+
+# IC and BP Events
+
+EVENT_ICACHE_FETCHES                        0x80     PMC
+UMASK_ICACHE_FETCHES                        0x00
+
+EVENT_ICACHE_MISSES                         0x81     PMC
+UMASK_ICACHE_MISSES                         0x00
+
+EVENT_ICACHE_L2_REFILLS                     0x82     PMC
+UMASK_ICACHE_L2_REFILLS                     0x00
+
+EVENT_ICACHE_SYSTEM_REFILLS                 0x83     PMC
+UMASK_ICACHE_SYSTEM_REFILLS                 0x00
+
+EVENT_L1_ITLB_MISS_L2_ITLB_HIT              0x84     PMC
+UMASK_L1_ITLB_MISS_L2_ITLB_HIT              0x00
+
+EVENT_L1_ITLB_MISS_L2_ITLB_MISS             0x85     PMC
+UMASK_L1_ITLB_MISS_L2_ITLB_MISS             0x00
+
+EVENT_PIPELINE_RESTART_DUE_INSTR_STREAM_PROBE    0x86 PMC
+UMASK_PIPELINE_RESTART_DUE_INSTR_STREAM_PROBE    0x00
+
+EVENT_L1_BTB_CORRECTION                    0x8A PMC
+UMASK_L1_BTB_CORRECTION                    0x00
+
+EVENT_L2_BTB_CORRECTION                    0x8B PMC
+UMASK_L2_BTB_CORRECTION                    0x00
+
+EVENT_ICACHE_LINES_INVALIDATED             0x8C PMC
+UMASK_ICACHE_LINES_INVALIDATED_FILL        0x01
+UMASK_ICACHE_LINES_INVALIDATED_L2_PROBE    0x02
+
+EVENT_DEC_OVERRIDE_BTB                     0x91 PMC
+UMASK_DEC_OVERRIDE_BTB                     0x00
+
+EVENT_ITLB_RELOADS                         0x99     PMC
+UMASK_ITLB_RELOADS                         0x00
+
+EVENT_OC_MODE_SWITCH                      0x28A PMC
+UMASK_OC_MODE_SWITCH_IC_OC                0x01
+UMASK_OC_MODE_SWITCH_OC_IC                0x02
+
+# DE Events
+
+EVENT_UOPS_DISP                           0xAA PMC
+UMASK_UOPS_DISP_FROM_DEC                  0x01
+UMASK_UOPS_DISP_FROM_OPCACHE              0x02
+
+EVENT_DYN_TOKENS_DISP_STALL_CYCLES0                          0xAF PMC
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALSQ1_TOKEN_STALL        0x01
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALSQ2_TOKEN_STALL        0x02
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALSQ3_TOKEN_STALL        0x04
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALSQ3_0_TOKEN_STALL      0x08
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALSQ_ANY_TOKEN_STALL     0x0F
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_ALU_TOKEN_STALL          0x10
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_AGSQ_TOKEN_STALL         0x20
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES0_RETIRE_TOKEN_STALL       0x40
+
+EVENT_DYN_TOKENS_DISP_STALL_CYCLES1                           0xAE PMC
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_INT_REG_FILE_STALL        0x01
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_LD_QUEUE_STALL            0x02
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_ST_QUEUE_STALL            0x04
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_INT_SCHED_MISC_STALL      0x08
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_TAKEN_BRANCH_BUFFER_STALL 0x10
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_FP_REG_FILE_STALL         0x20
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_FP_SCHED_STALL            0x40
+UMASK_DYN_TOKENS_DISP_STALL_CYCLES1_FP_MISC_UNAVAIL           0x80
+
+# EX (SC) Events
+
+EVENT_RETIRED_INSTRUCTIONS                    0xC0     PMC
+UMASK_RETIRED_INSTRUCTIONS                    0x00
+
+EVENT_RETIRED_UOPS                            0xC1     PMC
+UMASK_RETIRED_UOPS                            0x00
+
+EVENT_RETIRED_BRANCH_INSTR                    0xC2     PMC
+UMASK_RETIRED_BRANCH_INSTR                    0x00
+
+EVENT_RETIRED_MISP_BRANCH_INSTR               0xC3     PMC
+UMASK_RETIRED_MISP_BRANCH_INSTR               0x00
+
+EVENT_RETIRED_TAKEN_BRANCH_INSTR              0xC4     PMC
+UMASK_RETIRED_TAKEN_BRANCH_INSTR              0x00
+
+EVENT_RETIRED_TAKEN_MISP_BRANCH_INSTR         0xC5     PMC
+UMASK_RETIRED_TAKEN_MISP_BRANCH_INSTR         0x00
+
+EVENT_RETIRED_FAR_CONTROL_TRANSFERS           0xC6     PMC
+UMASK_RETIRED_FAR_CONTROL_TRANSFERS           0x00
+
+EVENT_RETIRED_NEAR_RETURNS                    0xC8     PMC
+UMASK_RETIRED_NEAR_RETURNS                    0x00
+
+EVENT_RETIRED_NEAR_RETURNS_MISP               0xC9     PMC
+UMASK_RETIRED_NEAR_RETURNS_MISP               0x00
+
+EVENT_RETIRED_INDIRECT_BRANCHES_MISP          0xCA     PMC
+UMASK_RETIRED_INDIRECT_BRANCHES_MISP          0x00
+
+EVENT_RETIRED_MMX_FP_INSTR                    0xCB     PMC
+UMASK_RETIRED_MMX_FP_INSTR_X87                0x01
+UMASK_RETIRED_MMX_FP_INSTR_MMX                0x02
+UMASK_RETIRED_MMX_FP_INSTR_SSE                0x04
+UMASK_RETIRED_MMX_FP_INSTR_ALL                0x07
+
+EVENT_RETIRED_COND_BRANCH_INSTR               0xD1 PMC
+UMASK_RETIRED_COND_BRANCH_INSTR               0x00
+
+EVENT_DIV_BUSY_CYCLES                         0xD3 PMC
+UMASK_DIV_BUSY_CYCLES                         0x00
+
+EVENT_DIV_OP_COUNT                            0xD4 PMC
+UMASK_DIV_OP_COUNT                            0x00
+
+EVENT_TAGGED_IBS_OPS                          0x1CF PMC
+UMASK_TAGGED_IBS_OPS_COUNT                    0x01
+UMASK_TAGGED_IBS_OPS_COUNT_RETIRED            0x02
+UMASK_TAGGED_IBS_OPS_COUNT_ROLLOVER           0x04
+
+EVENT_RETIRED_FUSED_BRANCH_INSTR              0x1D0 PMC
+UMASK_RETIRED_FUSED_BRANCH_INSTR              0x00
+
+# L2 Cache Events
+
+EVENT_REQUESTS_TO_L2_GRP1                     0x60 PMC
+UMASK_REQUESTS_TO_L2_GRP1_GRP2                0x01
+UMASK_REQUESTS_TO_L2_GRP1_L2_HW_PREF          0x02
+UMASK_REQUESTS_TO_L2_GRP1_PREF_L2             0x04
+UMASK_REQUESTS_TO_L2_GRP1_CHANGE_TO_X         0x08
+UMASK_REQUESTS_TO_L2_GRP1_CACHEABLE_IC_READ   0x10
+UMASK_REQUESTS_TO_L2_GRP1_LS_RD_BLOCK_C_S     0x20
+UMASK_REQUESTS_TO_L2_GRP1_RD_BLOCK_X          0x40
+UMASK_REQUESTS_TO_L2_GRP1_RD_BLOCK_L          0x80
+
+EVENT_REQUESTS_TO_L2_GRP2                     0x61 PMC
+UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_RESP       0x01
+UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_ORIG       0x02
+UMASK_REQUESTS_TO_L2_GRP2_SMC_INVAL           0x04
+UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED_NC    0x08
+UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED       0x10
+UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED_NC    0x20
+UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED       0x40
+UMASK_REQUESTS_TO_L2_GRP2_GRP1                0x80
+
+EVENT_L2_LATENCY_CYCLES                       0x62 PMC
+UMASK_L2_LATENCY_CYCLES_WAIT_ON_FILLS         0x01
+
+EVENT_CORE_TO_L2_CACHE_REQUESTS                      0x64 PMC
+UMASK_CORE_TO_L2_CACHE_REQUESTS_IC_FILL_MISS         0x01
+UMASK_CORE_TO_L2_CACHE_REQUESTS_IC_FILL_HIT_S        0x02
+UMASK_CORE_TO_L2_CACHE_REQUESTS_IC_FILL_HIT_X        0x04
+UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_C        0x08
+UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_X        0x10
+UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_L_HIT_S  0x20
+UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_L_HIT_X  0x40
+UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_CS       0x80
+
+EVENT_CYCLES_FILL_PEND_FROM_L2            0x6D PMC
+UMASK_CYCLES_FILL_PEND_FROM_L2_BUSY       0x01
+
+EVENT_L2_PF_HIT_IN_L2                     0x70 PMC
+UMASK_L2_PF_HIT_IN_L2                     0x00
+
+EVENT_L2_PF_HIT_IN_L3                     0x71 PMC
+UMASK_L2_PF_HIT_IN_L3                     0x00
+
+EVENT_L2_PF_MISS_IN_L3                    0x72 PMC
+UMASK_L2_PF_MISS_IN_L3                    0x00
+
+# L3 Cache Events
+
+EVENT_L3_ACCESS                           0x01 CPMC
+UMASK_L3_ACCESS                           0x80
+
+EVENT_L3_MISS                             0x06 CPMC
+UMASK_L3_MISS                             0x01
+
+EVENT_L3_CACHE_REQ                        0x04 CPMC
+UMASK_L3_CACHE_REQ                        0xFF
+
+EVENT_L3_MISS_LAT                         0x90 CPMC
+UMASK_L3_MISS_LAT                         0x00
+
+EVENT_L3_MISS_REQ                         0x9A CPMC
+UMASK_L3_MISS_REQ                         0x00
+
+# Energy Events
+
+EVENT_RAPL_CORE_ENERGY                    0x01 PWR0
+UMASK_RAPL_CORE_ENERGY                    0x00
+
+EVENT_RAPL_PKG_ENERGY                     0x02 PWR1
+UMASK_RAPL_PKG_ENERGY                     0x00
+
+# Data fabric events
+
+EVENT_DATA_FROM_LOCAL_DRAM_CHANNEL    0x07 DFC
+UMASK_DATA_FROM_LOCAL_DRAM_CHANNEL    0x38
+
+EVENT_DATA_TO_LOCAL_DRAM_CHANNEL      0x47 DFC
+UMASK_DATA_TO_LOCAL_DRAM_CHANNEL      0x38
+
+EVENT_DATA_OUT_TO_REMOTE_0            0x187 DFC
+UMASK_DATA_OUT_TO_REMOTE_0            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_1            0x1C7 DFC
+UMASK_DATA_OUT_TO_REMOTE_1            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_2            0x207 DFC
+UMASK_DATA_OUT_TO_REMOTE_2            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_3            0x287 DFC
+UMASK_DATA_OUT_TO_REMOTE_3            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_4            0x247 DFC
+UMASK_DATA_OUT_TO_REMOTE_4            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_5            0x2C7 DFC
+UMASK_DATA_OUT_TO_REMOTE_5            0x02
diff --git a/src/includes/perfmon_zen_counters.h b/src/includes/perfmon_zen_counters.h
index 31e57fdb7..6ec0e9911 100644
--- a/src/includes/perfmon_zen_counters.h
+++ b/src/includes/perfmon_zen_counters.h
@@ -84,17 +84,17 @@ static RegisterMap zen_counter_map[NUM_COUNTERS_ZEN] = {
     {"PWR0", PMC13, POWER, 0, MSR_AMD17_RAPL_CORE_STATUS, 0, 0},
     {"PWR1", PMC14, POWER, 0, MSR_AMD17_RAPL_PKG_STATUS, 0, 0},
     /* Northbridge counters */
-    {"UPMC0",PMC15, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
-    {"UPMC1",PMC16, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
-    {"UPMC2",PMC17, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
-    {"UPMC3",PMC18, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+    {"DFC0",PMC15, MBOX0, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
+    {"DFC1",PMC16, MBOX0, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
+    {"DFC2",PMC17, MBOX0, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
+    {"DFC3",PMC18, MBOX0, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
 };
 
 static BoxMap zen_box_map[NUM_UNITS] = {
     [FIXED] = {0, 0, 0, 0, 0, 0, 64},
     [PMC] = {0, 0, 0, 0, 0, 0, 48},
     [CBOX0] = {0, 0, 0, 0, 0, 0, 48},
-    [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+    [MBOX0] = {0, 0, 0, 0, 0, 0, 48},
     [POWER] = {0, 0, 0, 0, 0, 0, 32},
 };
 
@@ -103,5 +103,5 @@ static char* zen_translate_types[NUM_UNITS] = {
     [PMC] = "/sys/bus/event_source/devices/cpu",
     [POWER] = "/sys/bus/event_source/devices/power",
     [CBOX0] = "/sys/bus/event_source/devices/amd_l3",
-    [UNCORE] = "/sys/bus/event_source/devices/amd_nb",
+    [MBOX0] = "/sys/bus/event_source/devices/amd_df",
 };
diff --git a/src/includes/perfmon_zen_events.txt b/src/includes/perfmon_zen_events.txt
index 8ba52e10c..03ce74667 100644
--- a/src/includes/perfmon_zen_events.txt
+++ b/src/includes/perfmon_zen_events.txt
@@ -116,6 +116,7 @@ EVENT_LS_DISPATCH                      0x29    PMC
 UMASK_LS_DISPATCH_LOADS                0x01
 UMASK_LS_DISPATCH_STORES               0x02
 UMASK_LS_DISPATCH_LOAD_OP_STORES       0x04
+UMASK_LS_DISPATCH_ANY                  0x07
 
 EVENT_SMIS_RECEIVED                    0x2B PMC
 UMASK_SMIS_RECEIVED                    0x00
@@ -316,25 +317,29 @@ UMASK_RETIRED_FUSED_BRANCH_INSTR            0x00
 
 # L2 Cache Events
 
-EVENT_REQUESTS_TO_L2_GRP1                0x60 PMC
-UMASK_REQUESTS_TO_L2_GRP1_OTHER            0x01
+EVENT_REQUESTS_TO_L2_GRP1                   0x60 PMC
+UMASK_REQUESTS_TO_L2_GRP1_OTHER             0x01
 UMASK_REQUESTS_TO_L2_GRP1_L2_HW_PREF        0x02
-UMASK_REQUESTS_TO_L2_GRP1_PREF_L2            0x04
-UMASK_REQUESTS_TO_L2_GRP1_CHANGE_TO_X        0x08
-UMASK_REQUESTS_TO_L2_GRP1_CACHEABLE_IC_READ    0x10
-UMASK_REQUESTS_TO_L2_GRP1_LS_RD_BLOCK_C_S        0x20
+UMASK_REQUESTS_TO_L2_GRP1_PREF_L2           0x04
+UMASK_REQUESTS_TO_L2_GRP1_CHANGE_TO_X       0x08
+UMASK_REQUESTS_TO_L2_GRP1_CACHEABLE_IC_READ 0x10
+UMASK_REQUESTS_TO_L2_GRP1_LS_RD_BLOCK_C_S   0x20
 UMASK_REQUESTS_TO_L2_GRP1_RD_BLOCK_X        0x40
 UMASK_REQUESTS_TO_L2_GRP1_RD_BLOCK_L        0x80
-
-EVENT_REQUESTS_TO_L2_GRP2                     0x61 PMC
-UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_RESP       0x01
-UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_ORIG       0x02
-UMASK_REQUESTS_TO_L2_GRP2_SMC_INVAL           0x04
-UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED_NC    0x08
-UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED       0x10
-UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED_NC    0x20
-UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED       0x40
-UMASK_REQUESTS_TO_L2_GRP2_GRP1                0x80
+UMASK_REQUESTS_TO_L2_GRP1_ALL               0xFF
+UMASK_REQUESTS_TO_L2_GRP1_ALL_NO_PF         0xF9
+UMASK_REQUESTS_TO_L2_GRP1_DATA_CACHE_MISS   0xC8
+
+EVENT_REQUESTS_TO_L2_GRP2                   0x61 PMC
+UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_RESP     0x01
+UMASK_REQUESTS_TO_L2_GRP2_BUS_LOCK_ORIG     0x02
+UMASK_REQUESTS_TO_L2_GRP2_SMC_INVAL         0x04
+UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED_NC  0x08
+UMASK_REQUESTS_TO_L2_GRP2_IC_READ_SIZED     0x10
+UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED_NC  0x20
+UMASK_REQUESTS_TO_L2_GRP2_LS_READ_SIZED     0x40
+UMASK_REQUESTS_TO_L2_GRP2_GRP1              0x80
+UMASK_REQUESTS_TO_L2_GRP2_IC_READ           0x18
 
 EVENT_L2_LATENCY_CYCLES                     0x62 PMC
 UMASK_L2_LATENCY_CYCLES_WAIT_ON_FILLS        0x01
@@ -361,6 +366,18 @@ UMASK_CORE_TO_L2_CACHE_REQUESTS_LD_READ_BLK_CS       0x80
 EVENT_CYCLES_FILL_PEND_FROM_L2            0x6D PMC
 UMASK_CYCLES_FILL_PEND_FROM_L2_BUSY       0x01
 
+EVENT_L2_INST_CACHE_MISS        0x64 PMC
+UMASK_L2_INST_CACHE_MISS        0x01
+
+EVENT_L2_DATA_CACHE_MISS        0x64 PMC
+UMASK_L2_DATA_CACHE_MISS        0x08
+
+EVENT_L1_INST_MISS_L2_HIT       0x64 PMC
+UMASK_L1_INST_MISS_L2_HIT       0x06
+
+EVENT_L1_DATA_MISS_L2_HIT       0x64 PMC
+UMASK_L1_DATA_MISS_L2_HIT       0x70
+
 # L3 Cache Events
 
 EVENT_L3_ACCESS                        0x01 CPMC
@@ -376,3 +393,29 @@ UMASK_RAPL_CORE_ENERGY                  0x00
 
 EVENT_RAPL_PKG_ENERGY                  0x02 PWR1
 UMASK_RAPL_PKG_ENERGY                  0x00
+
+# Data fabric events
+
+EVENT_DATA_FROM_LOCAL_DRAM_CHANNEL    0x07 DFC
+UMASK_DATA_FROM_LOCAL_DRAM_CHANNEL    0x38
+
+EVENT_DATA_TO_LOCAL_DRAM_CHANNEL      0x47 DFC
+UMASK_DATA_TO_LOCAL_DRAM_CHANNEL      0x38
+
+EVENT_DATA_OUT_TO_REMOTE_0            0x187 DFC
+UMASK_DATA_OUT_TO_REMOTE_0            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_1            0x1C7 DFC
+UMASK_DATA_OUT_TO_REMOTE_1            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_2            0x207 DFC
+UMASK_DATA_OUT_TO_REMOTE_2            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_3            0x287 DFC
+UMASK_DATA_OUT_TO_REMOTE_3            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_4            0x247 DFC
+UMASK_DATA_OUT_TO_REMOTE_4            0x02
+
+EVENT_DATA_OUT_TO_REMOTE_5            0x2C7 DFC
+UMASK_DATA_OUT_TO_REMOTE_5            0x02
diff --git a/src/includes/registers.h b/src/includes/registers.h
index 1c17b0167..ea1ac12d6 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -170,7 +170,7 @@
 #define MSR_MIC2_PERFEVTSEL0          0x186
 #define MSR_MIC2_PERFEVTSEL1          0x187
 #define MSR_MIC2_TURBO_RATIO_LIMIT    0x1AD
-#define MSR_MIC2_SPFLT_CONTROL	      0x02C
+#define MSR_MIC2_SPFLT_CONTROL          0x02C
 #define MSR_MIC2_PERF_GLOBAL_STATUS    0x02D
 #define MSR_MIC2_PERF_GLOBAL_OVF_CTRL  0x02E
 #define MSR_MIC2_PERF_GLOBAL_CTRL      0x02F
@@ -178,13 +178,13 @@
 #define MSR_MIC2_U_GLOBAL_CTRL      0x700
 #define MSR_MIC2_U_GLOBAL_STATUS    0x701
 #define MSR_MIC2_U_CONFIG           0x702
-#define MSR_MIC2_U_FIXED_CTRL	      0x703
-#define MSR_MIC2_U_FIXED_CTR	      0x704
-#define MSR_MIC2_U_CTRL0	      0x705
-#define MSR_MIC2_U_CTRL1	      0x706
-#define MSR_MIC2_U_OVFL	      0x708
-#define MSR_MIC2_U_CTR0	      0x709
-#define MSR_MIC2_U_CTR1	      0x70A
+#define MSR_MIC2_U_FIXED_CTRL       0x703
+#define MSR_MIC2_U_FIXED_CTR        0x704
+#define MSR_MIC2_U_CTRL0            0x705
+#define MSR_MIC2_U_CTRL1            0x706
+#define MSR_MIC2_U_OVFL             0x708
+#define MSR_MIC2_U_CTR0             0x709
+#define MSR_MIC2_U_CTR1             0x70A
 /* Xeon Phi (Knights Landing) WBOX*/
 #define MSR_MIC2_PCU_GLOBAL_CTRL    0x710
 #define MSR_MIC2_PCU_CTRL0   0x711
@@ -653,97 +653,97 @@
 #define MSR_MIC2_C37_FILTER1          0xFC2
 #define MSR_MIC2_C37_STATUS           0xFC3
 /* Xeon Phi (Knights Landing) Embedded DRAM controller aka High Bandwidth Memory */
-#define PCI_MIC2_EDC_U_CTR0_A	     0x404
-#define PCI_MIC2_EDC_U_CTR0_B	     0x400
-#define PCI_MIC2_EDC_U_CTR1_A	     0x40C
-#define PCI_MIC2_EDC_U_CTR1_B	     0x408
-#define PCI_MIC2_EDC_U_CTR2_A	     0x414
-#define PCI_MIC2_EDC_U_CTR2_B	     0x410
-#define PCI_MIC2_EDC_U_CTR3_A	     0x41C
-#define PCI_MIC2_EDC_U_CTR3_B	     0x418
-#define PCI_MIC2_EDC_U_CTRL0	     0x420
-#define PCI_MIC2_EDC_U_CTRL1	     0x424
-#define PCI_MIC2_EDC_U_CTRL2	     0x428
-#define PCI_MIC2_EDC_U_CTRL3	     0x42C
-#define PCI_MIC2_EDC_U_BOX_CTRL	     0x430
+#define PCI_MIC2_EDC_U_CTR0_A         0x404
+#define PCI_MIC2_EDC_U_CTR0_B         0x400
+#define PCI_MIC2_EDC_U_CTR1_A         0x40C
+#define PCI_MIC2_EDC_U_CTR1_B         0x408
+#define PCI_MIC2_EDC_U_CTR2_A         0x414
+#define PCI_MIC2_EDC_U_CTR2_B         0x410
+#define PCI_MIC2_EDC_U_CTR3_A         0x41C
+#define PCI_MIC2_EDC_U_CTR3_B         0x418
+#define PCI_MIC2_EDC_U_CTRL0         0x420
+#define PCI_MIC2_EDC_U_CTRL1         0x424
+#define PCI_MIC2_EDC_U_CTRL2         0x428
+#define PCI_MIC2_EDC_U_CTRL3         0x42C
+#define PCI_MIC2_EDC_U_BOX_CTRL         0x430
 #define PCI_MIC2_EDC_U_BOX_STATUS    0x434
 #define PCI_MIC2_EDC_U_FIXED_CTR_A   0x450
 #define PCI_MIC2_EDC_U_FIXED_CTR_B   0x44C
 #define PCI_MIC2_EDC_U_FIXED_CTRL    0x454
-#define PCI_MIC2_EDC_D_CTR0_A	     0xA04
-#define PCI_MIC2_EDC_D_CTR0_B	     0xA00
-#define PCI_MIC2_EDC_D_CTR1_A	     0xA0C
-#define PCI_MIC2_EDC_D_CTR1_B	     0xA08
-#define PCI_MIC2_EDC_D_CTR2_A	     0xA14
-#define PCI_MIC2_EDC_D_CTR2_B	     0xA10
-#define PCI_MIC2_EDC_D_CTR3_A	     0xA1C
-#define PCI_MIC2_EDC_D_CTR3_B	     0xA18
-#define PCI_MIC2_EDC_D_CTRL0	     0xA20
-#define PCI_MIC2_EDC_D_CTRL1	     0xA24
-#define PCI_MIC2_EDC_D_CTRL2	     0xA28
-#define PCI_MIC2_EDC_D_CTRL3	     0xA2C
-#define PCI_MIC2_EDC_D_BOX_CTRL	     0xA30
+#define PCI_MIC2_EDC_D_CTR0_A         0xA04
+#define PCI_MIC2_EDC_D_CTR0_B         0xA00
+#define PCI_MIC2_EDC_D_CTR1_A         0xA0C
+#define PCI_MIC2_EDC_D_CTR1_B         0xA08
+#define PCI_MIC2_EDC_D_CTR2_A         0xA14
+#define PCI_MIC2_EDC_D_CTR2_B         0xA10
+#define PCI_MIC2_EDC_D_CTR3_A         0xA1C
+#define PCI_MIC2_EDC_D_CTR3_B         0xA18
+#define PCI_MIC2_EDC_D_CTRL0         0xA20
+#define PCI_MIC2_EDC_D_CTRL1         0xA24
+#define PCI_MIC2_EDC_D_CTRL2         0xA28
+#define PCI_MIC2_EDC_D_CTRL3         0xA2C
+#define PCI_MIC2_EDC_D_BOX_CTRL         0xA30
 #define PCI_MIC2_EDC_D_BOX_STATUS    0xA34
 #define PCI_MIC2_EDC_D_FIXED_CTR_A   0xA40
 #define PCI_MIC2_EDC_D_FIXED_CTR_B   0xA3C
 #define PCI_MIC2_EDC_D_FIXED_CTRL    0xA44
 /* Xeon Phi (Knights Landing) Memory controller*/
-#define PCI_MIC2_MC_U_CTR0_A	     0x404
-#define PCI_MIC2_MC_U_CTR0_B	     0x400
-#define PCI_MIC2_MC_U_CTR1_A	     0x40C
-#define PCI_MIC2_MC_U_CTR1_B	     0x408
-#define PCI_MIC2_MC_U_CTR2_A	     0x414
-#define PCI_MIC2_MC_U_CTR2_B	     0x410
-#define PCI_MIC2_MC_U_CTR3_A	     0x41C
-#define PCI_MIC2_MC_U_CTR3_B	     0x418
-#define PCI_MIC2_MC_U_CTRL0	     0x420
-#define PCI_MIC2_MC_U_CTRL1	     0x424
-#define PCI_MIC2_MC_U_CTRL2	     0x428
-#define PCI_MIC2_MC_U_CTRL3	     0x42C
-#define PCI_MIC2_MC_U_BOX_CTRL	     0x430
+#define PCI_MIC2_MC_U_CTR0_A         0x404
+#define PCI_MIC2_MC_U_CTR0_B         0x400
+#define PCI_MIC2_MC_U_CTR1_A         0x40C
+#define PCI_MIC2_MC_U_CTR1_B         0x408
+#define PCI_MIC2_MC_U_CTR2_A         0x414
+#define PCI_MIC2_MC_U_CTR2_B         0x410
+#define PCI_MIC2_MC_U_CTR3_A         0x41C
+#define PCI_MIC2_MC_U_CTR3_B         0x418
+#define PCI_MIC2_MC_U_CTRL0         0x420
+#define PCI_MIC2_MC_U_CTRL1         0x424
+#define PCI_MIC2_MC_U_CTRL2         0x428
+#define PCI_MIC2_MC_U_CTRL3         0x42C
+#define PCI_MIC2_MC_U_BOX_CTRL         0x430
 #define PCI_MIC2_MC_U_BOX_STATUS    0x434
 #define PCI_MIC2_MC_U_FIXED_CTR_A   0x450
 #define PCI_MIC2_MC_U_FIXED_CTR_B   0x44C
 #define PCI_MIC2_MC_U_FIXED_CTRL    0x454
-#define PCI_MIC2_MC_D_CTR0_A	     0xB04
-#define PCI_MIC2_MC_D_CTR0_B	     0xB00
-#define PCI_MIC2_MC_D_CTR1_A	     0xB0C
-#define PCI_MIC2_MC_D_CTR1_B	     0xB08
-#define PCI_MIC2_MC_D_CTR2_A	     0xB14
-#define PCI_MIC2_MC_D_CTR2_B	     0xB10
-#define PCI_MIC2_MC_D_CTR3_A	     0xB1C
-#define PCI_MIC2_MC_D_CTR3_B	     0xB18
-#define PCI_MIC2_MC_D_CTRL0	     0xB20
-#define PCI_MIC2_MC_D_CTRL1	     0xB24
-#define PCI_MIC2_MC_D_CTRL2	     0xB28
-#define PCI_MIC2_MC_D_CTRL3	     0xB2C
-#define PCI_MIC2_MC_D_BOX_CTRL	     0xB30
+#define PCI_MIC2_MC_D_CTR0_A         0xB04
+#define PCI_MIC2_MC_D_CTR0_B         0xB00
+#define PCI_MIC2_MC_D_CTR1_A         0xB0C
+#define PCI_MIC2_MC_D_CTR1_B         0xB08
+#define PCI_MIC2_MC_D_CTR2_A         0xB14
+#define PCI_MIC2_MC_D_CTR2_B         0xB10
+#define PCI_MIC2_MC_D_CTR3_A         0xB1C
+#define PCI_MIC2_MC_D_CTR3_B         0xB18
+#define PCI_MIC2_MC_D_CTRL0         0xB20
+#define PCI_MIC2_MC_D_CTRL1         0xB24
+#define PCI_MIC2_MC_D_CTRL2         0xB28
+#define PCI_MIC2_MC_D_CTRL3         0xB2C
+#define PCI_MIC2_MC_D_BOX_CTRL         0xB30
 #define PCI_MIC2_MC_D_BOX_STATUS    0xB34
 #define PCI_MIC2_MC_D_FIXED_CTR_A   0xB40
 #define PCI_MIC2_MC_D_FIXED_CTR_B   0xB3C
 #define PCI_MIC2_MC_D_FIXED_CTRL    0xB44
 /* Xeon Phi (Knights Landing) M2PCIE */
-#define PCI_MIC2_M2PCIE_CTR0_A	    0xA4
-#define PCI_MIC2_M2PCIE_CTR0_B	    0xA0
-#define PCI_MIC2_M2PCIE_CTR1_A	    0xAC
-#define PCI_MIC2_M2PCIE_CTR1_B	    0xA8
-#define PCI_MIC2_M2PCIE_CTR2_A	    0xB4
-#define PCI_MIC2_M2PCIE_CTR2_B	    0xB0
-#define PCI_MIC2_M2PCIE_CTR3_A	    0xBC
-#define PCI_MIC2_M2PCIE_CTR3_B	    0xB8
-#define PCI_MIC2_M2PCIE_CTRL0	    0xD8
-#define PCI_MIC2_M2PCIE_CTRL1	    0xDC
-#define PCI_MIC2_M2PCIE_CTRL2	    0xE0
-#define PCI_MIC2_M2PCIE_CTRL3	    0xE4
+#define PCI_MIC2_M2PCIE_CTR0_A        0xA4
+#define PCI_MIC2_M2PCIE_CTR0_B        0xA0
+#define PCI_MIC2_M2PCIE_CTR1_A        0xAC
+#define PCI_MIC2_M2PCIE_CTR1_B        0xA8
+#define PCI_MIC2_M2PCIE_CTR2_A        0xB4
+#define PCI_MIC2_M2PCIE_CTR2_B        0xB0
+#define PCI_MIC2_M2PCIE_CTR3_A        0xBC
+#define PCI_MIC2_M2PCIE_CTR3_B        0xB8
+#define PCI_MIC2_M2PCIE_CTRL0        0xD8
+#define PCI_MIC2_M2PCIE_CTRL1        0xDC
+#define PCI_MIC2_M2PCIE_CTRL2        0xE0
+#define PCI_MIC2_M2PCIE_CTRL3        0xE4
 #define PCI_MIC2_M2PCIE_BOX_CTRL    0xF4
 #define PCI_MIC2_M2PCIE_BOX_STATUS  0xF8
 /* Xeon Phi (Knights Landing) IRP */
-#define PCI_MIC2_IRP_CTR0	    0xA0
-#define PCI_MIC2_IRP_CTR1	    0xA8
-#define PCI_MIC2_IRP_CTRL0	    0xD8
-#define PCI_MIC2_IRP_CTRL1	    0xDC
-#define PCI_MIC2_IRP_BOX_CTRL	    0xF0
-#define PCI_MIC2_IRP_BOX_STATUS	    0xF4
+#define PCI_MIC2_IRP_CTR0        0xA0
+#define PCI_MIC2_IRP_CTR1        0xA8
+#define PCI_MIC2_IRP_CTRL0        0xD8
+#define PCI_MIC2_IRP_CTRL1        0xDC
+#define PCI_MIC2_IRP_BOX_CTRL        0xF0
+#define PCI_MIC2_IRP_BOX_STATUS        0xF4
 
 /* Core v1/v2 type uncore
  * Naming following Intel Uncore Performance Monitoring Guide
@@ -1577,40 +1577,40 @@
 #define MSR_UNC_SKX_IRP_CBDMA_BOX_CTL        0xA58
 #define MSR_UNC_SKX_IRP_CBDMA_BOX_STATUS     0xA5F
 
-#define	MSR_UNC_SKX_IRP_PCIE0_CTL0	0xA7B
-#define	MSR_UNC_SKX_IRP_PCIE0_CTL1	0xA7C
-#define	MSR_UNC_SKX_IRP_PCIE0_CTR0	0xA79
-#define	MSR_UNC_SKX_IRP_PCIE0_CTR1	0xA7A
-#define	MSR_UNC_SKX_IRP_PCIE0_BOX_CTL	0xA78
-#define	MSR_UNC_SKX_IRP_PCIE0_BOX_STATUS	0xA7F
-
-#define	MSR_UNC_SKX_IRP_PCIE1_CTL0	0xA9B
-#define	MSR_UNC_SKX_IRP_PCIE1_CTL1	0xA9C
-#define	MSR_UNC_SKX_IRP_PCIE1_CTR0	0xA99
-#define	MSR_UNC_SKX_IRP_PCIE1_CTR1	0xA9A
-#define	MSR_UNC_SKX_IRP_PCIE1_BOX_CTL	0xA98
-#define	MSR_UNC_SKX_IRP_PCIE1_BOX_STATUS	0xA9F
-
-#define	MSR_UNC_SKX_IRP_PCIE2_CTL0	0xABB
-#define	MSR_UNC_SKX_IRP_PCIE2_CTL1	0xABC
-#define	MSR_UNC_SKX_IRP_PCIE2_CTR0	0xAB9
-#define	MSR_UNC_SKX_IRP_PCIE2_CTR1	0xABA
-#define	MSR_UNC_SKX_IRP_PCIE2_BOX_CTL	0xAB8
-#define	MSR_UNC_SKX_IRP_PCIE2_BOX_STATUS	0xABF
-
-#define	MSR_UNC_SKX_IRP_MCP0_CTL0	0xADB
-#define	MSR_UNC_SKX_IRP_MCP0_CTL1	0xADC
-#define	MSR_UNC_SKX_IRP_MCP0_CTR0	0xAD9
-#define	MSR_UNC_SKX_IRP_MCP0_CTR1	0xADA
-#define	MSR_UNC_SKX_IRP_MCP0_BOX_CTL	0xAD8
-#define	MSR_UNC_SKX_IRP_MCP0_BOX_STATUS	0xADF
-
-#define	MSR_UNC_SKX_IRP_MCP1_CTL0	0xAFB
-#define	MSR_UNC_SKX_IRP_MCP1_CTL1	0xAFC
-#define	MSR_UNC_SKX_IRP_MCP1_CTR0	0xAF9
-#define	MSR_UNC_SKX_IRP_MCP1_CTR1	0xAFA
-#define	MSR_UNC_SKX_IRP_MCP1_BOX_CTL	0xAF8
-#define	MSR_UNC_SKX_IRP_MCP1_BOX_STATUS	0xAFF
+#define    MSR_UNC_SKX_IRP_PCIE0_CTL0    0xA7B
+#define    MSR_UNC_SKX_IRP_PCIE0_CTL1    0xA7C
+#define    MSR_UNC_SKX_IRP_PCIE0_CTR0    0xA79
+#define    MSR_UNC_SKX_IRP_PCIE0_CTR1    0xA7A
+#define    MSR_UNC_SKX_IRP_PCIE0_BOX_CTL    0xA78
+#define    MSR_UNC_SKX_IRP_PCIE0_BOX_STATUS    0xA7F
+
+#define    MSR_UNC_SKX_IRP_PCIE1_CTL0    0xA9B
+#define    MSR_UNC_SKX_IRP_PCIE1_CTL1    0xA9C
+#define    MSR_UNC_SKX_IRP_PCIE1_CTR0    0xA99
+#define    MSR_UNC_SKX_IRP_PCIE1_CTR1    0xA9A
+#define    MSR_UNC_SKX_IRP_PCIE1_BOX_CTL    0xA98
+#define    MSR_UNC_SKX_IRP_PCIE1_BOX_STATUS    0xA9F
+
+#define    MSR_UNC_SKX_IRP_PCIE2_CTL0    0xABB
+#define    MSR_UNC_SKX_IRP_PCIE2_CTL1    0xABC
+#define    MSR_UNC_SKX_IRP_PCIE2_CTR0    0xAB9
+#define    MSR_UNC_SKX_IRP_PCIE2_CTR1    0xABA
+#define    MSR_UNC_SKX_IRP_PCIE2_BOX_CTL    0xAB8
+#define    MSR_UNC_SKX_IRP_PCIE2_BOX_STATUS    0xABF
+
+#define    MSR_UNC_SKX_IRP_MCP0_CTL0    0xADB
+#define    MSR_UNC_SKX_IRP_MCP0_CTL1    0xADC
+#define    MSR_UNC_SKX_IRP_MCP0_CTR0    0xAD9
+#define    MSR_UNC_SKX_IRP_MCP0_CTR1    0xADA
+#define    MSR_UNC_SKX_IRP_MCP0_BOX_CTL    0xAD8
+#define    MSR_UNC_SKX_IRP_MCP0_BOX_STATUS    0xADF
+
+#define    MSR_UNC_SKX_IRP_MCP1_CTL0    0xAFB
+#define    MSR_UNC_SKX_IRP_MCP1_CTL1    0xAFC
+#define    MSR_UNC_SKX_IRP_MCP1_CTR0    0xAF9
+#define    MSR_UNC_SKX_IRP_MCP1_CTR1    0xAFA
+#define    MSR_UNC_SKX_IRP_MCP1_BOX_CTL    0xAF8
+#define    MSR_UNC_SKX_IRP_MCP1_BOX_STATUS    0xAFF
 
 /* SKX iMC (Memory controller) */
 #define PCI_UNC_SKX_MC_PMON_BOX_CTL         0xF4
@@ -1681,65 +1681,65 @@
 #define MSR_UNC_SKX_II0_CBDMA_BOX_STATUS     0xA47
 #define MSR_UNC_SKX_II0_CBDMA_CLOCK          0xA45
 
-#define	MSR_UNC_SKX_II0_PCIE0_CTL0	0xA68
-#define	MSR_UNC_SKX_II0_PCIE0_CTL1	0xA69
-#define	MSR_UNC_SKX_II0_PCIE0_CTL2	0xA6A
-#define	MSR_UNC_SKX_II0_PCIE0_CTL3	0xA6B
-#define	MSR_UNC_SKX_II0_PCIE0_CTR0	0xA61
-#define	MSR_UNC_SKX_II0_PCIE0_CTR1	0xA62
-#define	MSR_UNC_SKX_II0_PCIE0_CTR2	0xA63
-#define	MSR_UNC_SKX_II0_PCIE0_CTR3	0xA64
-#define	MSR_UNC_SKX_II0_PCIE0_BOX_CTL	0xA60
-#define	MSR_UNC_SKX_II0_PCIE0_BOX_STATUS	0xA67
-#define	MSR_UNC_SKX_II0_PCIE0_CLOCK	0xA65
-
-#define	MSR_UNC_SKX_II0_PCIE1_CTL0	0xA88
-#define	MSR_UNC_SKX_II0_PCIE1_CTL1	0xA89
-#define	MSR_UNC_SKX_II0_PCIE1_CTL2	0xA8A
-#define	MSR_UNC_SKX_II0_PCIE1_CTL3	0xA8B
-#define	MSR_UNC_SKX_II0_PCIE1_CTR0	0xA81
-#define	MSR_UNC_SKX_II0_PCIE1_CTR1	0xA82
-#define	MSR_UNC_SKX_II0_PCIE1_CTR2	0xA83
-#define	MSR_UNC_SKX_II0_PCIE1_CTR3	0xA84
-#define	MSR_UNC_SKX_II0_PCIE1_BOX_CTL	0xA80
-#define	MSR_UNC_SKX_II0_PCIE1_BOX_STATUS	0xA87
-#define	MSR_UNC_SKX_II0_PCIE1_CLOCK	0xA85
-
-#define	MSR_UNC_SKX_II0_PCIE2_CTL0	0xAA8
-#define	MSR_UNC_SKX_II0_PCIE2_CTL1	0xAA9
-#define	MSR_UNC_SKX_II0_PCIE2_CTL2	0xAAA
-#define	MSR_UNC_SKX_II0_PCIE2_CTL3	0xAAB
-#define	MSR_UNC_SKX_II0_PCIE2_CTR0	0xAA1
-#define	MSR_UNC_SKX_II0_PCIE2_CTR1	0xAA2
-#define	MSR_UNC_SKX_II0_PCIE2_CTR2	0xAA3
-#define	MSR_UNC_SKX_II0_PCIE2_CTR3	0xAA4
-#define	MSR_UNC_SKX_II0_PCIE2_BOX_CTL	0xAA0
-#define	MSR_UNC_SKX_II0_PCIE2_BOX_STATUS	0xAA7
-#define	MSR_UNC_SKX_II0_PCIE2_CLOCK	0xAA5
-
-#define	MSR_UNC_SKX_II0_MCP0_CTL0	0xAC8
-#define	MSR_UNC_SKX_II0_MCP0_CTL1	0xAC9
-#define	MSR_UNC_SKX_II0_MCP0_CTL2	0xACA
-#define	MSR_UNC_SKX_II0_MCP0_CTL3	0xACB
-#define	MSR_UNC_SKX_II0_MCP0_CTR0	0xAC1
-#define	MSR_UNC_SKX_II0_MCP0_CTR1	0xAC2
-#define	MSR_UNC_SKX_II0_MCP0_CTR2	0xAC3
-#define	MSR_UNC_SKX_II0_MCP0_CTR3	0xAC4
-#define	MSR_UNC_SKX_II0_MCP0_BOX_CTL	0xAC0
-#define	MSR_UNC_SKX_II0_MCP0_BOX_STATUS	0xAC7
-#define	MSR_UNC_SKX_II0_MCP0_CLOCK	0xAC5
-
-#define	MSR_UNC_SKX_II0_MCP1_CTL0	0xAE8
-#define	MSR_UNC_SKX_II0_MCP1_CTL1	0xAE9
-#define	MSR_UNC_SKX_II0_MCP1_CTL2	0xAEA
-#define	MSR_UNC_SKX_II0_MCP1_CTL3	0xAEB
-#define	MSR_UNC_SKX_II0_MCP1_CTR0	0xAE1
-#define	MSR_UNC_SKX_II0_MCP1_CTR1	0xAE2
-#define	MSR_UNC_SKX_II0_MCP1_CTR2	0xAE3
-#define	MSR_UNC_SKX_II0_MCP1_CTR3	0xAE4
-#define	MSR_UNC_SKX_II0_MCP1_BOX_CTL	0xAE0
-#define	MSR_UNC_SKX_II0_MCP1_BOX_STATUS	0xAE7
-#define	MSR_UNC_SKX_II0_MCP1_CLOCK	0xAE5
+#define    MSR_UNC_SKX_II0_PCIE0_CTL0    0xA68
+#define    MSR_UNC_SKX_II0_PCIE0_CTL1    0xA69
+#define    MSR_UNC_SKX_II0_PCIE0_CTL2    0xA6A
+#define    MSR_UNC_SKX_II0_PCIE0_CTL3    0xA6B
+#define    MSR_UNC_SKX_II0_PCIE0_CTR0    0xA61
+#define    MSR_UNC_SKX_II0_PCIE0_CTR1    0xA62
+#define    MSR_UNC_SKX_II0_PCIE0_CTR2    0xA63
+#define    MSR_UNC_SKX_II0_PCIE0_CTR3    0xA64
+#define    MSR_UNC_SKX_II0_PCIE0_BOX_CTL    0xA60
+#define    MSR_UNC_SKX_II0_PCIE0_BOX_STATUS    0xA67
+#define    MSR_UNC_SKX_II0_PCIE0_CLOCK    0xA65
+
+#define    MSR_UNC_SKX_II0_PCIE1_CTL0    0xA88
+#define    MSR_UNC_SKX_II0_PCIE1_CTL1    0xA89
+#define    MSR_UNC_SKX_II0_PCIE1_CTL2    0xA8A
+#define    MSR_UNC_SKX_II0_PCIE1_CTL3    0xA8B
+#define    MSR_UNC_SKX_II0_PCIE1_CTR0    0xA81
+#define    MSR_UNC_SKX_II0_PCIE1_CTR1    0xA82
+#define    MSR_UNC_SKX_II0_PCIE1_CTR2    0xA83
+#define    MSR_UNC_SKX_II0_PCIE1_CTR3    0xA84
+#define    MSR_UNC_SKX_II0_PCIE1_BOX_CTL    0xA80
+#define    MSR_UNC_SKX_II0_PCIE1_BOX_STATUS    0xA87
+#define    MSR_UNC_SKX_II0_PCIE1_CLOCK    0xA85
+
+#define    MSR_UNC_SKX_II0_PCIE2_CTL0    0xAA8
+#define    MSR_UNC_SKX_II0_PCIE2_CTL1    0xAA9
+#define    MSR_UNC_SKX_II0_PCIE2_CTL2    0xAAA
+#define    MSR_UNC_SKX_II0_PCIE2_CTL3    0xAAB
+#define    MSR_UNC_SKX_II0_PCIE2_CTR0    0xAA1
+#define    MSR_UNC_SKX_II0_PCIE2_CTR1    0xAA2
+#define    MSR_UNC_SKX_II0_PCIE2_CTR2    0xAA3
+#define    MSR_UNC_SKX_II0_PCIE2_CTR3    0xAA4
+#define    MSR_UNC_SKX_II0_PCIE2_BOX_CTL    0xAA0
+#define    MSR_UNC_SKX_II0_PCIE2_BOX_STATUS    0xAA7
+#define    MSR_UNC_SKX_II0_PCIE2_CLOCK    0xAA5
+
+#define    MSR_UNC_SKX_II0_MCP0_CTL0    0xAC8
+#define    MSR_UNC_SKX_II0_MCP0_CTL1    0xAC9
+#define    MSR_UNC_SKX_II0_MCP0_CTL2    0xACA
+#define    MSR_UNC_SKX_II0_MCP0_CTL3    0xACB
+#define    MSR_UNC_SKX_II0_MCP0_CTR0    0xAC1
+#define    MSR_UNC_SKX_II0_MCP0_CTR1    0xAC2
+#define    MSR_UNC_SKX_II0_MCP0_CTR2    0xAC3
+#define    MSR_UNC_SKX_II0_MCP0_CTR3    0xAC4
+#define    MSR_UNC_SKX_II0_MCP0_BOX_CTL    0xAC0
+#define    MSR_UNC_SKX_II0_MCP0_BOX_STATUS    0xAC7
+#define    MSR_UNC_SKX_II0_MCP0_CLOCK    0xAC5
+
+#define    MSR_UNC_SKX_II0_MCP1_CTL0    0xAE8
+#define    MSR_UNC_SKX_II0_MCP1_CTL1    0xAE9
+#define    MSR_UNC_SKX_II0_MCP1_CTL2    0xAEA
+#define    MSR_UNC_SKX_II0_MCP1_CTL3    0xAEB
+#define    MSR_UNC_SKX_II0_MCP1_CTR0    0xAE1
+#define    MSR_UNC_SKX_II0_MCP1_CTR1    0xAE2
+#define    MSR_UNC_SKX_II0_MCP1_CTR2    0xAE3
+#define    MSR_UNC_SKX_II0_MCP1_CTR3    0xAE4
+#define    MSR_UNC_SKX_II0_MCP1_BOX_CTL    0xAE0
+#define    MSR_UNC_SKX_II0_MCP1_BOX_STATUS    0xAE7
+#define    MSR_UNC_SKX_II0_MCP1_CLOCK    0xAE5
 
 /* SKX Free-Running IIO Bandwidth Counters */
 #define MSR_UNC_SKX_II0_CBDMA_BAND_PORT0_IN      0xB00
@@ -1751,50 +1751,50 @@
 #define MSR_UNC_SKX_II0_CBDMA_BAND_PORT2_OUT     0xB06
 #define MSR_UNC_SKX_II0_CBDMA_BAND_PORT3_OUT     0xB07
 
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT0_IN	0xB10
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT1_IN	0xB11
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT2_IN	0xB12
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT3_IN	0xB13
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT0_OUT	0xB14
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT1_OUT	0xB15
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT2_OUT	0xB16
-#define	MSR_UNC_SKX_II0_PCIE0_BAND_PORT3_OUT	0xB17
-
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT0_IN	0xB20
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT1_IN	0xB21
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT2_IN	0xB22
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT3_IN	0xB23
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT0_OUT	0xB24
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT1_OUT	0xB25
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT2_OUT	0xB26
-#define	MSR_UNC_SKX_II0_PCIE1_BAND_PORT3_OUT	0xB27
-
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT0_IN	0xB30
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT1_IN	0xB31
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT2_IN	0xB32
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT3_IN	0xB33
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT0_OUT	0xB34
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT1_OUT	0xB35
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT2_OUT	0xB36
-#define	MSR_UNC_SKX_II0_PCIE2_BAND_PORT3_OUT	0xB37
-
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT0_IN	0xB40
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT1_IN	0xB41
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT2_IN	0xB42
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT3_IN	0xB43
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT0_OUT	0xB44
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT1_OUT	0xB45
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT2_OUT	0xB46
-#define	MSR_UNC_SKX_II0_MCP0_BAND_PORT3_OUT	0xB47
-
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT0_IN	0xB50
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT1_IN	0xB51
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT2_IN	0xB52
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT3_IN	0xB53
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT0_OUT	0xB54
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT1_OUT	0xB55
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT2_OUT	0xB56
-#define	MSR_UNC_SKX_II0_MCP1_BAND_PORT3_OUT	0xB57
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT0_IN    0xB10
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT1_IN    0xB11
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT2_IN    0xB12
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT3_IN    0xB13
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT0_OUT    0xB14
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT1_OUT    0xB15
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT2_OUT    0xB16
+#define    MSR_UNC_SKX_II0_PCIE0_BAND_PORT3_OUT    0xB17
+
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT0_IN    0xB20
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT1_IN    0xB21
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT2_IN    0xB22
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT3_IN    0xB23
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT0_OUT    0xB24
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT1_OUT    0xB25
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT2_OUT    0xB26
+#define    MSR_UNC_SKX_II0_PCIE1_BAND_PORT3_OUT    0xB27
+
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT0_IN    0xB30
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT1_IN    0xB31
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT2_IN    0xB32
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT3_IN    0xB33
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT0_OUT    0xB34
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT1_OUT    0xB35
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT2_OUT    0xB36
+#define    MSR_UNC_SKX_II0_PCIE2_BAND_PORT3_OUT    0xB37
+
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT0_IN    0xB40
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT1_IN    0xB41
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT2_IN    0xB42
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT3_IN    0xB43
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT0_OUT    0xB44
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT1_OUT    0xB45
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT2_OUT    0xB46
+#define    MSR_UNC_SKX_II0_MCP0_BAND_PORT3_OUT    0xB47
+
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT0_IN    0xB50
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT1_IN    0xB51
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT2_IN    0xB52
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT3_IN    0xB53
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT0_OUT    0xB54
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT1_OUT    0xB55
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT2_OUT    0xB56
+#define    MSR_UNC_SKX_II0_MCP1_BAND_PORT3_OUT    0xB57
 
 /* SKX Free-Running IIO Utilization Counters */
 #define MSR_UNC_SKX_II0_CBDMA_UTIL_PORT0_IN      0xB08
@@ -1806,50 +1806,50 @@
 #define MSR_UNC_SKX_II0_CBDMA_UTIL_PORT2_OUT     0xB0D
 #define MSR_UNC_SKX_II0_CBDMA_UTIL_PORT3_OUT     0xB0F
 
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT0_IN	0xB18
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT1_IN	0xB1A
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT2_IN	0xB1C
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT3_IN	0xB1E
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT0_OUT	0xB19
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT1_OUT	0xB1B
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT2_OUT	0xB1D
-#define	MSR_UNC_SKX_II0_PCIE0_UTIL_PORT3_OUT	0xB1F
-
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT0_IN	0xB28
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT1_IN	0xB2A
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT2_IN	0xB2C
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT3_IN	0xB2E
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT0_OUT	0xB29
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT1_OUT	0xB2B
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT2_OUT	0xB2D
-#define	MSR_UNC_SKX_II0_PCIE1_UTIL_PORT3_OUT	0xB2F
-
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT0_IN	0xB38
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT1_IN	0xB3A
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT2_IN	0xB3C
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT3_IN	0xB3E
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT0_OUT	0xB39
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT1_OUT	0xB3B
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT2_OUT	0xB3D
-#define	MSR_UNC_SKX_II0_PCIE2_UTIL_PORT3_OUT	0xB3F
-
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT0_IN	0xB48
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT1_IN	0xB4A
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT2_IN	0xB4C
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT3_IN	0xB4E
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT0_OUT	0xB49
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT1_OUT	0xB4B
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT2_OUT	0xB4D
-#define	MSR_UNC_SKX_II0_MCP0_UTIL_PORT3_OUT	0xB4F
-
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT0_IN	0xB58
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT1_IN	0xB5A
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT2_IN	0xB5C
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT3_IN	0xB5E
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT0_OUT	0xB59
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT1_OUT	0xB5B
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT2_OUT	0xB5D
-#define	MSR_UNC_SKX_II0_MCP1_UTIL_PORT3_OUT	0xB5F
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT0_IN    0xB18
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT1_IN    0xB1A
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT2_IN    0xB1C
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT3_IN    0xB1E
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT0_OUT    0xB19
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT1_OUT    0xB1B
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT2_OUT    0xB1D
+#define    MSR_UNC_SKX_II0_PCIE0_UTIL_PORT3_OUT    0xB1F
+
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT0_IN    0xB28
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT1_IN    0xB2A
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT2_IN    0xB2C
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT3_IN    0xB2E
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT0_OUT    0xB29
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT1_OUT    0xB2B
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT2_OUT    0xB2D
+#define    MSR_UNC_SKX_II0_PCIE1_UTIL_PORT3_OUT    0xB2F
+
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT0_IN    0xB38
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT1_IN    0xB3A
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT2_IN    0xB3C
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT3_IN    0xB3E
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT0_OUT    0xB39
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT1_OUT    0xB3B
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT2_OUT    0xB3D
+#define    MSR_UNC_SKX_II0_PCIE2_UTIL_PORT3_OUT    0xB3F
+
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT0_IN    0xB48
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT1_IN    0xB4A
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT2_IN    0xB4C
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT3_IN    0xB4E
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT0_OUT    0xB49
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT1_OUT    0xB4B
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT2_OUT    0xB4D
+#define    MSR_UNC_SKX_II0_MCP0_UTIL_PORT3_OUT    0xB4F
+
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT0_IN    0xB58
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT1_IN    0xB5A
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT2_IN    0xB5C
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT3_IN    0xB5E
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT0_OUT    0xB59
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT1_OUT    0xB5B
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT2_OUT    0xB5D
+#define    MSR_UNC_SKX_II0_MCP1_UTIL_PORT3_OUT    0xB5F
 
 
 
@@ -2335,43 +2335,60 @@
 
 /* AMD 0x17 (Zen) */
 
-#define MSR_AMD17_PERFEVTSEL0		0xC0010200
-#define MSR_AMD17_PMC0			    0xC0010201
-#define MSR_AMD17_PERFEVTSEL1		0xC0010202
-#define MSR_AMD17_PMC1			    0xC0010203
-#define MSR_AMD17_PERFEVTSEL2		0xC0010204
-#define MSR_AMD17_PMC2			    0xC0010205
-#define MSR_AMD17_PERFEVTSEL3		0xC0010206
-#define MSR_AMD17_PMC3			    0xC0010207
-
-#define MSR_AMD17_L3_PERFEVTSEL0		0xC0010230
-#define MSR_AMD17_L3_PMC0		        0xC0010231
-#define MSR_AMD17_L3_PERFEVTSEL1		0xC0010232
-#define MSR_AMD17_L3_PMC1		        0xC0010233
-#define MSR_AMD17_L3_PERFEVTSEL2		0xC0010234
-#define MSR_AMD17_L3_PMC2		        0xC0010235
-#define MSR_AMD17_L3_PERFEVTSEL3		0xC0010236
-#define MSR_AMD17_L3_PMC3		        0xC0010237
-#define MSR_AMD17_L3_PERFEVTSEL4		0xC0010238
-#define MSR_AMD17_L3_PMC4		        0xC0010239
-#define MSR_AMD17_L3_PERFEVTSEL5		0xC001023A
-#define MSR_AMD17_L3_PMC5		        0xC001023B
-
-#define MSR_AMD17_HW_CONFIG			    0xC0010015
-#define MSR_AMD17_SYS_CONFIG			0xC0010010
-
-#define MSR_AMD17_RO_INST_RETIRED_CTR	0xC00000E9
-#define MSR_AMD17_RO_APERF				0xC00000E8
-#define MSR_AMD17_RO_MPERF			    0xC00000E7
-#define MSR_AMD17_INST_RETIRED_CTR		0x000000E9
-#define MSR_AMD17_APERF				    0x000000E8
-#define MSR_AMD17_MPERF				    0x000000E7
-
-#define MSR_AMD17_FEATURE_ENABLE		0xC0000080
-
-#define MSR_AMD17_RAPL_POWER_UNIT		0xC0010299
-#define MSR_AMD17_RAPL_CORE_STATUS		0xC001029A
-#define MSR_AMD17_RAPL_PKG_STATUS		0xC001029B
+#define MSR_AMD17_PERFEVTSEL0        0xC0010200
+#define MSR_AMD17_PMC0               0xC0010201
+#define MSR_AMD17_PERFEVTSEL1        0xC0010202
+#define MSR_AMD17_PMC1               0xC0010203
+#define MSR_AMD17_PERFEVTSEL2        0xC0010204
+#define MSR_AMD17_PMC2               0xC0010205
+#define MSR_AMD17_PERFEVTSEL3        0xC0010206
+#define MSR_AMD17_PMC3               0xC0010207
+
+#define MSR_AMD17_L3_PERFEVTSEL0        0xC0010230
+#define MSR_AMD17_L3_PMC0               0xC0010231
+#define MSR_AMD17_L3_PERFEVTSEL1        0xC0010232
+#define MSR_AMD17_L3_PMC1               0xC0010233
+#define MSR_AMD17_L3_PERFEVTSEL2        0xC0010234
+#define MSR_AMD17_L3_PMC2               0xC0010235
+#define MSR_AMD17_L3_PERFEVTSEL3        0xC0010236
+#define MSR_AMD17_L3_PMC3               0xC0010237
+#define MSR_AMD17_L3_PERFEVTSEL4        0xC0010238
+#define MSR_AMD17_L3_PMC4               0xC0010239
+#define MSR_AMD17_L3_PERFEVTSEL5        0xC001023A
+#define MSR_AMD17_L3_PMC5               0xC001023B
+
+#define MSR_AMD17_HW_CONFIG             0xC0010015
+#define MSR_AMD17_SYS_CONFIG            0xC0010010
+
+#define MSR_AMD17_RO_INST_RETIRED_CTR   0xC00000E9
+#define MSR_AMD17_RO_APERF              0xC00000E8
+#define MSR_AMD17_RO_MPERF              0xC00000E7
+#define MSR_AMD17_INST_RETIRED_CTR      0x000000E9
+#define MSR_AMD17_APERF                 0x000000E8
+#define MSR_AMD17_MPERF                 0x000000E7
+
+#define MSR_AMD17_FEATURE_ENABLE        0xC0000080
+
+#define MSR_AMD17_RAPL_POWER_UNIT       0xC0010299
+#define MSR_AMD17_RAPL_CORE_STATUS      0xC001029A
+#define MSR_AMD17_RAPL_PKG_STATUS       0xC001029B
+
+/* AMD 0x17 Model 0x31 (Zen2): registers additional to the Zen ones */
+
+#define MSR_AMD17_2_PERFEVTSEL4        0xC0010208
+#define MSR_AMD17_2_PMC4               0xC0010209
+#define MSR_AMD17_2_PERFEVTSEL5        0xC001020A
+#define MSR_AMD17_2_PMC5               0xC001020B
+
+#define MSR_AMD17_2_DF_PERFEVTSEL0        0xC0010240
+#define MSR_AMD17_2_DF_PMC0               0xC0010241
+#define MSR_AMD17_2_DF_PERFEVTSEL1        0xC0010242
+#define MSR_AMD17_2_DF_PMC1               0xC0010243
+#define MSR_AMD17_2_DF_PERFEVTSEL2        0xC0010244
+#define MSR_AMD17_2_DF_PMC2               0xC0010245
+#define MSR_AMD17_2_DF_PERFEVTSEL3        0xC0010246
+#define MSR_AMD17_2_DF_PMC3               0xC0010247
+
 /* ARM Cortex A15 */
 #define A15_PMC0                        0x0000
 #define A15_PMC1                        0x0004
diff --git a/src/includes/registers_types.h b/src/includes/registers_types.h
index e893ee490..aa6b4ea56 100644
--- a/src/includes/registers_types.h
+++ b/src/includes/registers_types.h
@@ -163,15 +163,24 @@ static char* RegisterTypeNames[MAX_UNITS] = {
     [MBOX5FIX] = "Memory Controller 1 Channel 1 Fixed Counter",
     [MBOX6FIX] = "Memory Controller 1 Channel 2 Fixed Counter",
     [MBOX7FIX] = "Memory Controller 1 Channel 3 Fixed Counter",
+#ifdef _ARCH_PPC
+    [BBOX0] = "Memory controller synchronous (port 0 & 1)",
+    [BBOX1] = "Memory controller synchronous (port 2 & 3)",
+#else
     [BBOX0] = "Home Agent box 0",
     [BBOX1] = "Home Agent box 1",
+#endif
     [RBOX0] = "Routing box 0",
     [RBOX1] = "Routing box 1",
     [RBOX2] = "Routing box 2",
     [WBOX] = "Power control box",
     [WBOX0FIX] = "Power control box fixed counter 0",
     [WBOX1FIX] = "Power control box fixed counter 1",
+#ifdef _ARCH_PPC
+    [SBOX0] = "PowerBus",
+#else
     [SBOX0] = "QPI Link Layer box 0",
+#endif
     [SBOX1] = "QPI Link Layer box 1",
     [SBOX2] = "QPI Link Layer box 2",
     [SBOX3] = "QPI Link Layer box 3",
@@ -229,8 +238,15 @@ static char* RegisterTypeNames[MAX_UNITS] = {
     [IBOX3] = "Coherency Maintainer for IIO traffic",
     [IBOX4] = "Coherency Maintainer for IIO traffic",
     [IBOX5] = "Coherency Maintainer for IIO traffic",
+#ifdef _ARCH_PPC
+    [QBOX0] = "Xlink 0",
+    [QBOX1] = "Xlink 1",
+    [QBOX2] = "Xlink 2",
+#else
     [QBOX0] = "QPI Link Layer 0",
     [QBOX1] = "QPI Link Layer 1",
+    [QBOX2] = "QPI Link Layer 2",
+#endif
     [QBOX0FIX] = "QPI Link Layer rate status 0",
     [QBOX1FIX] = "QPI Link Layer rate status 1",
     [EUBOX0] = "Embedded DRAM controller 0",
diff --git a/src/includes/topology.h b/src/includes/topology.h
index de100fa0a..06b4865a1 100644
--- a/src/includes/topology.h
+++ b/src/includes/topology.h
@@ -44,7 +44,7 @@
 #include <types.h>
 #include <tree.h>
 
-#define MAX_FEATURE_STRING_LENGTH 512
+#define MAX_FEATURE_STRING_LENGTH 512 /* only used in deprecated topology_cpuid.c */
 #define MAX_MODEL_STRING_LENGTH 512
 
 struct topology_functions {
@@ -130,6 +130,7 @@ struct topology_functions {
 #define ATHLON64_G1     0x6FU
 #define ATHLON64_G2     0x7FU
 #define ZEN_RYZEN       0x01
+#define ZEN2_RYZEN      0x31
 
 /* ARM */
 #define  ARM7L          0x3U
@@ -161,7 +162,10 @@ struct topology_functions {
 #define MARVELL		0x56U
 #define INTEL_ARM	0x69U
 
-
+/* POWER */
+#define POWER7          0x7U
+#define POWER8          0x8U
+#define POWER9          0x9U
 
 #define  P6_FAMILY        0x6U
 #define  MIC_FAMILY       0xBU
@@ -173,6 +177,7 @@ struct topology_functions {
 #define  K8_FAMILY        0xFU
 #define  ARMV7_FAMILY     0x7U
 #define  ARMV8_FAMILY     0x8U
+#define  PPC_FAMILY       0x42U
 
 extern int cpu_count(cpu_set_t* set);
 
diff --git a/src/includes/topology_types.h b/src/includes/topology_types.h
index 6070c9ad8..367edcd16 100644
--- a/src/includes/topology_types.h
+++ b/src/includes/topology_types.h
@@ -60,6 +60,7 @@ typedef enum {
     SSSE3, /*!< \brief Supplemental Streaming SIMD Extensions 3 */
     SSE41, /*!< \brief Streaming SIMD Extensions 4.1 */
     SSE42, /*!< \brief Streaming SIMD Extensions 4.2 */
+    SSE4A, /*!< \brief Streaming SIMD Extensions 4A */
     AVX, /*!< \brief Advanced Vector Extensions */
     FMA, /*!< \brief Fused multiply-add (FMA3) */
     AVX2, /*!< \brief Advanced Vector Extensions 2 */
@@ -75,6 +76,10 @@ typedef enum {
     VFPV3, /*!< \brief First generation SIMD Version 3 */
     VFPV4, /*!< \brief First generation SIMD Version 4 */
     TLS, /*!< \brief Thread-local storage registers */
+    FP, /*!< \brief Floating-point support */
+    ASIMD, /*!< \brief ARMv8 Advanced SIMD */
+    ASIMDRDM, /*!< \brief ARMv8 Advanced SIMD: Rounding double multiply */
+    PMULL, /*!< \brief ARMv8 Advanced SIMD: Polynomial multiply long */
 } FeatureBit;
 /** @}*/
 #endif /*CPUID_TYPES_H*/
diff --git a/src/libperfctr.c b/src/libperfctr.c
index e129517fe..c3200a70d 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -153,6 +153,8 @@ likwid_markerInit(void)
     char* cThreadStr = getenv("LIKWID_THREADS");
     char* filepath = getenv("LIKWID_FILEPATH");
     char* perfpid = getenv("LIKWID_PERF_EXECPID");
+    char* debugStr = getenv("LIKWID_DEBUG");
+    char* pinStr = getenv("LIKWID_PIN");
     char execpid[20];
     /* Dirty hack to avoid nonnull warnings */
     int (*ownatoi)(const char*);
@@ -183,12 +185,12 @@ likwid_markerInit(void)
     affinity_init();
     hashTable_init();
 
-//#ifndef LIKWID_USE_PERFEVENT
+#ifndef LIKWID_USE_PERFEVENT
     HPMmode(atoi(modeStr));
-//#endif
-    if (getenv("LIKWID_DEBUG") != NULL)
+#endif
+    if (debugStr != NULL)
     {
-        perfmon_verbosity = atoi(getenv("LIKWID_DEBUG"));
+        perfmon_verbosity = atoi(debugStr);
         verbosity = perfmon_verbosity;
     }
 
@@ -202,7 +204,7 @@ likwid_markerInit(void)
     bdestroy(bThreadStr);
     bstrListDestroy(threadTokens);
 
-    if (getenv("LIKWID_PIN") != NULL)
+    if (pinStr != NULL)
     {
         likwid_pinThread(threads2Cpu[0]);
         if (getenv("OMP_NUM_THREADS") != NULL)
@@ -288,6 +290,7 @@ likwid_markerThreadInit(void)
     {
         return;
     }
+    char* pinStr = getenv("LIKWID_PIN");
 
     pthread_mutex_lock(&globalLock);
     t = pthread_self();
@@ -305,7 +308,7 @@ likwid_markerThreadInit(void)
     }
     pthread_mutex_unlock(&globalLock);
 
-    if (getenv("LIKWID_PIN") != NULL)
+    if (pinStr != NULL)
     {
         cpu_set_t cpuset;
         CPU_ZERO(&cpuset);
@@ -458,17 +461,9 @@ likwid_markerClose(void)
         fprintf(stderr, "%s", strerror(errno));
     }
     if (validRegions)
+    {
         free(validRegions);
-}
-
-void __attribute__((destructor (101))) likwid_markerCloseDestruct(void)
-{
-    LikwidResults* results = NULL;
-    int numberOfThreads = 0;
-    int numberOfRegions = 0;
-    if (!likwid_init)
-        return;
-    hashTable_finalize(&numberOfThreads, &numberOfRegions, &results);
+    }
     if ((numberOfThreads == 0)||(numberOfThreads == 0))
     {
         return;
@@ -485,7 +480,6 @@ void __attribute__((destructor (101))) likwid_markerCloseDestruct(void)
         free(results[i].cpulist);
         free(results[i].counters);
     }
-
     if (results != NULL)
     {
         free(results);
@@ -793,4 +787,3 @@ likwid_pinProcess(int processorId)
 
     return TRUE;
 }
-
diff --git a/src/luawid.c b/src/luawid.c
index bab79cad7..a66da744a 100644
--- a/src/luawid.c
+++ b/src/luawid.c
@@ -735,6 +735,9 @@ lua_likwid_getCpuInfo(lua_State* L)
     lua_pushstring(L,"features");
     lua_pushstring(L,cpuinfo->features);
     lua_settable(L,-3);
+    lua_pushstring(L,"architecture");
+    lua_pushstring(L,cpuinfo->architecture);
+    lua_settable(L,-3);
     lua_pushstring(L,"isIntel");
     lua_pushinteger(L,cpuinfo->isIntel);
     lua_settable(L,-3);
@@ -1245,6 +1248,31 @@ lua_likwid_setMemInterleaved(lua_State* L)
     return 0;
 }
 
+static int
+lua_likwid_setMembind(lua_State* L) /* Lua binding: args (count, table); copies count table entries into an int array and passes it to numa_setMembind() */
+{
+    int ret; /* used as the 1-based table index in the loop below, not as a status code */
+    int nrThreads = luaL_checknumber(L,1); /* argument 1: number of table entries to read */
+    luaL_argcheck(L, nrThreads > 0, 1, "Thread count must be greater than 0");
+    int cpus[nrThreads]; /* variable-length array sized by argument 1 */
+    if (!lua_istable(L, -1)) { /* the table is taken from the top of the stack */
+      lua_pushstring(L,"No table given as second argument");
+      lua_error(L);
+    }
+    for (ret = 1; ret<=nrThreads; ret++)
+    {
+        lua_rawgeti(L,-1,ret); /* push table[ret] */
+#if LUA_VERSION_NUM == 501
+        cpus[ret-1] = ((lua_Integer)lua_tointeger(L,-1));
+#else
+        cpus[ret-1] = ((lua_Unsigned)lua_tointegerx(L,-1, NULL));
+#endif
+        lua_pop(L,1); /* remove the pushed element so the table is on top again */
+    }
+    numa_setMembind(cpus, nrThreads);
+    return 0; /* no values are returned to Lua */
+}
+
 static int
 lua_likwid_getAffinityInfo(lua_State* L)
 {
@@ -2292,7 +2320,7 @@ static int
 lua_likwid_getRegion(lua_State* L)
 {
     int i = 0;
-    const char* tag = (const char*)luaL_checkstring(L, -2);
+    const char* tag = (const char*)luaL_checkstring(L, -1);
     int nr_events = perfmon_getNumberOfEvents(perfmon_getIdOfActiveGroup());
     double* events = NULL;
     double time = 0.0;
@@ -2322,6 +2350,13 @@ lua_likwid_getRegion(lua_State* L)
     return 4;
 }
 
+static int
+lua_likwid_resetRegion(lua_State* L) /* Lua binding: arg (region tag string); returns the result of likwid_markerResetRegion() */
+{
+    lua_pushinteger(L, likwid_markerResetRegion(luaL_checkstring(L, -1))); /* tag is read from the top of the stack */
+    return 1; /* exactly one value was pushed; without this return the result count handed to Lua is undefined */
+}
+
 static int
 lua_likwid_cpuFeatures_init(lua_State* L)
 {
@@ -2519,6 +2554,20 @@ lua_likwid_markerRegionMetric(lua_State* L)
     return 1;
 }
 
+static int
+lua_likwid_initFreq(lua_State* L)
+{
+    lua_pushnumber(L, freq_init());
+    return 1;
+}
+
+static int
+lua_likwid_finalizeFreq(lua_State* L)
+{
+    freq_finalize();
+    return 0;
+}
+
 static int
 lua_likwid_getCpuClockCurrent(lua_State* L)
 {
@@ -2535,6 +2584,14 @@ lua_likwid_getCpuClockMin(lua_State* L)
     return 1;
 }
 
+static int
+lua_likwid_getConfCpuClockMin(lua_State* L)
+{
+    const int cpu_id = lua_tointeger(L,-1);
+    lua_pushnumber(L, freq_getConfCpuClockMin(cpu_id));
+    return 1;
+}
+
 static int
 lua_likwid_setCpuClockMin(lua_State* L)
 {
@@ -2552,6 +2609,14 @@ lua_likwid_getCpuClockMax(lua_State* L)
     return 1;
 }
 
+static int
+lua_likwid_getConfCpuClockMax(lua_State* L)
+{
+    const int cpu_id = lua_tointeger(L,-1);
+    lua_pushnumber(L, freq_getConfCpuClockMax(cpu_id));
+    return 1;
+}
+
 static int
 lua_likwid_setCpuClockMax(lua_State* L)
 {
@@ -2623,7 +2688,10 @@ lua_likwid_getAvailGovs(lua_State* L)
     const int cpu_id = lua_tointeger(L,-1);
     char* avail = freq_getAvailGovs(cpu_id);
     if (avail)
+    {
         lua_pushstring(L, avail);
+        free(avail);
+    }
     else
         lua_pushnil(L);
     return 1;
@@ -3393,6 +3461,7 @@ luaopen_liblikwid(lua_State* L){
     lua_register(L, "likwid_getNumaInfo",lua_likwid_getNumaInfo);
     lua_register(L, "likwid_putNumaInfo",lua_likwid_putNumaInfo);
     lua_register(L, "likwid_setMemInterleaved", lua_likwid_setMemInterleaved);
+    lua_register(L, "likwid_setMembind", lua_likwid_setMembind);
     lua_register(L, "likwid_getAffinityInfo",lua_likwid_getAffinityInfo);
     lua_register(L, "likwid_putAffinityInfo",lua_likwid_putAffinityInfo);
     lua_register(L, "likwid_getPowerInfo",lua_likwid_getPowerInfo);
@@ -3451,6 +3520,7 @@ luaopen_liblikwid(lua_State* L){
     lua_register(L, "likwid_startRegion", lua_likwid_startRegion);
     lua_register(L, "likwid_stopRegion", lua_likwid_stopRegion);
     lua_register(L, "likwid_getRegion", lua_likwid_getRegion);
+    lua_register(L, "likwid_resetRegion", lua_likwid_resetRegion);
     // CPU feature manipulation functions
     lua_register(L, "likwid_cpuFeaturesInit", lua_likwid_cpuFeatures_init);
     lua_register(L, "likwid_cpuFeaturesGet", lua_likwid_cpuFeatures_get);
@@ -3470,10 +3540,14 @@ luaopen_liblikwid(lua_State* L){
     lua_register(L, "likwid_markerRegionResult", lua_likwid_markerRegionResult);
     lua_register(L, "likwid_markerRegionMetric", lua_likwid_markerRegionMetric);
     // CPU frequency functions
+    lua_register(L, "likwid_initFreq", lua_likwid_initFreq);
+    lua_register(L, "likwid_finalizeFreq", lua_likwid_finalizeFreq);
     lua_register(L, "likwid_getCpuClockCurrent", lua_likwid_getCpuClockCurrent);
     lua_register(L, "likwid_getCpuClockMin", lua_likwid_getCpuClockMin);
+    lua_register(L, "likwid_getConfCpuClockMin", lua_likwid_getConfCpuClockMin);
     lua_register(L, "likwid_setCpuClockMin", lua_likwid_setCpuClockMin);
     lua_register(L, "likwid_getCpuClockMax", lua_likwid_getCpuClockMax);
+    lua_register(L, "likwid_getConfCpuClockMax", lua_likwid_getConfCpuClockMax);
     lua_register(L, "likwid_setCpuClockMax", lua_likwid_setCpuClockMax);
     lua_register(L, "likwid_getGovernor", lua_likwid_getGovernor);
     lua_register(L, "likwid_setGovernor", lua_likwid_setGovernor);
diff --git a/src/numa.c b/src/numa.c
index 64d911285..6a8452959 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -112,6 +112,15 @@ empty_numa_setInterleaved(const int* processorList, int numberOfProcessors)
     return;
 }
 
+void
+empty_numa_setMembind(const int* processorList, int numberOfProcessors)
+{
+    /* Placeholder backend: prints a notice and validates the arguments */
+    printf("MEMPOLICY NOT supported in kernel!\n");
+    if (processorList == NULL || numberOfProcessors < 0)
+        printf("Invalid options\n");
+}
+
 void
 empty_numa_membind(void* ptr, size_t size, int domainId)
 {
@@ -125,10 +134,11 @@ const struct numa_functions numa_funcs = {
 #ifndef HAS_MEMPOLICY
     .numa_init = empty_numa_init,
     .numa_setInterleaved = empty_numa_setInterleaved,
-    .numa_membind = empty_numa_membind
+    .numa_membind = empty_numa_membind,
+    .numa_setMembind = empty_numa_setMembind,
 #else
 #ifdef LIKWID_USE_HWLOC
-#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A)
+#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) || defined(_ARCH_PPC64)
     .numa_init = proc_numa_init,
 #else
     .numa_init = hwloc_numa_init,
@@ -137,7 +147,8 @@ const struct numa_functions numa_funcs = {
     .numa_init = proc_numa_init,
 #endif
     .numa_setInterleaved = proc_numa_setInterleaved,
-    .numa_membind = proc_numa_membind
+    .numa_membind = proc_numa_membind,
+    .numa_setMembind = proc_numa_setMembind,
 #endif
 };
 
@@ -166,7 +177,8 @@ numa_init(void)
         cpu_set_t cpuSet;
         CPU_ZERO(&cpuSet);
         sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
-        if (cpuid_topology.activeHWThreads < cpuid_topology.numHWThreads)
+        if (cpuid_topology.activeHWThreads < cpuid_topology.numHWThreads &&
+            getenv("HWLOC_FSROOT") == NULL)
         {
             ret = proc_numa_init();
         }
@@ -196,6 +208,14 @@ numa_membind(void* ptr, size_t size, int domainId)
     return;
 }
 
+void
+numa_setMembind(const int* processorList, int numberOfProcessors)
+{
+    /* Forward to the numa_setMembind entry of the numa_funcs table */
+    numa_funcs.numa_setMembind(processorList, numberOfProcessors);
+    return;
+}
+
 #ifndef HAS_MEMPOLICY
 void
 numa_finalize(void)
diff --git a/src/numa_hwloc.c b/src/numa_hwloc.c
index 99b2dcc7a..56feb7a47 100644
--- a/src/numa_hwloc.c
+++ b/src/numa_hwloc.c
@@ -216,19 +216,30 @@ hwloc_numa_init(void)
 {
     int errno;
     uint32_t i;
-    int d;
+    int d, j;
     int depth;
     int cores_per_socket;
     int numPUs = 0;
     hwloc_obj_t obj;
     const struct hwloc_distances_s* distances;
+    struct hwloc_distances_s* dists = NULL;
+    unsigned dist_count = 1;
+#if HWLOC_API_VERSION > 0x00020000
+    hwloc_obj_type_t hwloc_type = HWLOC_OBJ_NUMANODE;
+#else
     hwloc_obj_type_t hwloc_type = HWLOC_OBJ_NODE;
+#endif
     if (numaInitialized > 0 || numa_info.numberOfNodes > 0)
         return 0;
 
     if (!hwloc_topology)
     {
         likwid_hwloc_topology_init(&hwloc_topology);
+#if HWLOC_API_VERSION > 0x00020000
+        likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM );
+#else
+        likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM|HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+#endif
         likwid_hwloc_topology_load(hwloc_topology);
     }
 #if defined(__x86_64) || defined(__i386__)
@@ -243,7 +254,11 @@ hwloc_numa_init(void)
     if (numa_info.numberOfNodes == 0)
     {
 #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__)
+#if HWLOC_API_VERSION > 0x00020000
+        hwloc_type = HWLOC_OBJ_NUMANODE;
+#else
         hwloc_type = HWLOC_OBJ_NODE;
+#endif
 #else
         hwloc_type = HWLOC_OBJ_SOCKET;
 #endif
@@ -296,13 +311,32 @@ hwloc_numa_init(void)
             return -1;
         }
         depth = likwid_hwloc_get_type_depth(hwloc_topology, hwloc_type);
+#if HWLOC_API_VERSION > 0x00020000
+        errno = likwid_hwloc_distances_get_by_type(hwloc_topology, hwloc_type, &dist_count, &dists, HWLOC_DISTANCES_KIND_FROM_OS, 0);
+#else
         distances = likwid_hwloc_get_whole_distance_matrix_by_type(hwloc_topology, hwloc_type);
+#endif
         for (i=0; i<numa_info.numberOfNodes; i++)
         {
             obj = likwid_hwloc_get_obj_by_depth(hwloc_topology, depth, i);
 
             numa_info.nodes[i].id = obj->os_index;
-
+#if HWLOC_API_VERSION > 0x00020000
+            if (obj->attr->numanode.local_memory != 0)
+            {
+                numa_info.nodes[i].totalMemory = (uint64_t)(obj->attr->numanode.local_memory/1024);
+            }
+            else if (obj->attr->numanode.page_types_len != 0)
+            {
+                numa_info.nodes[i].totalMemory = 0;
+                for (int k = 0; k < obj->attr->numanode.page_types_len; k++)
+                {
+                    uint64_t size = obj->attr->numanode.page_types[k].size;
+                    uint64_t count = obj->attr->numanode.page_types[k].count;
+                    numa_info.nodes[i].totalMemory += (uint64_t)((size*count)/1024);
+                }
+            }
+#else
             if (obj->memory.local_memory != 0)
             {
                 numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.local_memory/1024);
@@ -311,6 +345,7 @@ hwloc_numa_init(void)
             {
                 numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.total_memory/1024);
             }
+#endif
             else
             {
                 numa_info.nodes[i].totalMemory = getTotalNodeMem(numa_info.nodes[i].id);
@@ -325,8 +360,23 @@ hwloc_numa_init(void)
                 return -1;
             }
             d = 0;
+            j = 0;
+            // call before hwloc update
+
+#if HWLOC_API_VERSION > 0x00020000
+            for (d = 0; d < cpuid_topology.numHWThreads; d++)
+            {
+                if (likwid_hwloc_bitmap_isset(obj->cpuset, d))
+                {
+                    numa_info.nodes[i].processors[j] = d;
+                    j++;
+                }
+            }
+            numa_info.nodes[i].numberOfProcessors = j;
+#else
             numa_info.nodes[i].numberOfProcessors = likwid_hwloc_record_objs_of_type_below_obj(
-                    hwloc_topology, obj, HWLOC_OBJ_PU, &d, &numa_info.nodes[i].processors);
+                        hwloc_topology, obj, HWLOC_OBJ_PU, &d, &numa_info.nodes[i].processors);
+#endif
             numa_info.nodes[i].distances = (uint32_t*) malloc(numa_info.numberOfNodes * sizeof(uint32_t));
             if (!numa_info.nodes[i].distances)
             {
@@ -334,6 +384,19 @@ hwloc_numa_init(void)
                         numa_info.numberOfNodes*sizeof(uint32_t),i);
                 return -1;
             }
+#if HWLOC_API_VERSION > 0x00020000
+            if (dists)
+            {
+                int base = hwloc_distances_obj_index(dists, obj);
+                for (d = 0; d < dists->nbobjs; d++)
+                {
+                    hwloc_obj_t dobj = dists->objs[d];
+                    int idx = hwloc_distances_obj_index(dists, dobj);
+                    numa_info.nodes[i].distances[idx] = dists->values[(base*dists->nbobjs)+idx];
+                }
+                numa_info.nodes[i].numberOfDistances = numa_info.numberOfNodes;
+            }
+#else
             if (distances)
             {
                 numa_info.nodes[i].numberOfDistances = distances->nbobjs;
@@ -342,16 +405,22 @@ hwloc_numa_init(void)
                     numa_info.nodes[i].distances[d] = distances->latency[i*distances->nbobjs + d] * distances->latency_base;
                 }
             }
+#endif
             else
             {
                 numa_info.nodes[i].numberOfDistances = numa_info.numberOfNodes;
-                for(d=0;d<numa_info.numberOfNodes;d++)
+                for(d = 0; d < numa_info.numberOfNodes; d++)
                 {
                     numa_info.nodes[i].distances[d] = 10;
                 }
             }
-
         }
+#if HWLOC_API_VERSION > 0x00020000
+        if (dists)
+        {
+            hwloc_distances_release_remove(hwloc_topology, dists);
+        }
+#endif
     }
 
     if (numa_info.nodes[0].numberOfProcessors == 0)
@@ -369,11 +438,17 @@ void
 hwloc_numa_membind(void* ptr, size_t size, int domainId)
 {
     int ret = 0;
-    hwloc_membind_flags_t flags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+    if (!ptr || size == 0 || domainId < 0 || domainId >= numa_info.numberOfNodes)
+        return;
+    hwloc_membind_flags_t flags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS|HWLOC_MEMBIND_BYNODESET;
     hwloc_nodeset_t nodeset = likwid_hwloc_bitmap_alloc();
     likwid_hwloc_bitmap_zero(nodeset);
     likwid_hwloc_bitmap_set(nodeset, domainId);
+#if HWLOC_API_VERSION > 0x00020000
+    ret = likwid_hwloc_set_area_membind(hwloc_topology, ptr, size, nodeset, HWLOC_MEMBIND_BIND, flags);
+#else
     ret = likwid_hwloc_set_area_membind_nodeset(hwloc_topology, ptr, size, nodeset, HWLOC_MEMBIND_BIND, flags);
+#endif
     likwid_hwloc_bitmap_free(nodeset);
 
     if (ret < 0)
diff --git a/src/numa_proc.c b/src/numa_proc.c
index be57358dd..8f681585b 100644
--- a/src/numa_proc.c
+++ b/src/numa_proc.c
@@ -106,6 +106,55 @@ setConfiguredNodes(void)
     return maxIdConfiguredNode;
 }
 
+/* Collect the ids of all nodeN entries below /sys/devices/system/node.
+ * With array == NULL only the number of nodeN entries is returned. Otherwise
+ * at most maxlen ids are stored in ascending order and the number of stored
+ * ids is returned. Returns 0 if the directory cannot be opened. */
+static int
+get_numaNodes(int* array, int maxlen)
+{
+    DIR *dir = NULL;
+    struct dirent *de = NULL;
+    int count = 0;
+    int stored = 0;
+    int i = 0;
+    int j = 0;
+
+    dir = opendir("/sys/devices/system/node");
+    if (!dir)
+    {
+        return 0;
+    }
+
+    while ((de = readdir(dir)) != NULL)
+    {
+        /* Skip every entry whose name does not start with "node" */
+        if (strncmp(de->d_name, "node", 4))
+        {
+            continue;
+        }
+        if (array && stored < maxlen)
+        {
+            array[stored++] = str2int(de->d_name+4);
+        }
+        count++;
+    }
+    /* Release the directory stream again */
+    closedir(dir);
+    /* Insertion sort, limited to the entries actually written to array */
+    for (i = 1; i < stored; i++)
+    {
+        for (j = i; j > 0 && array[j-1] > array[j]; j--)
+        {
+            int tmp = array[j];
+            array[j] = array[j-1];
+            array[j-1] = tmp;
+        }
+    }
+
+    return (array ? stored : count);
+}
+
 static void
 nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
 {
@@ -116,7 +165,6 @@ nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
     int i;
 
     filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
-
     if (NULL != (fp = fopen (bdata(filename), "r")))
     {
         bstring src = bread ((bNread) fread, fp);
@@ -216,7 +264,7 @@ nodeProcessorList(int node, uint32_t** list)
                         }
                         else
                         {
-                            ERROR_PRINT(Number Of threads %d too large,count);
+                            ERROR_PRINT(Number Of threads %d too large for NUMA node %d, count, node);
                             return -EFAULT;
                         }
                         count++;
@@ -305,7 +353,14 @@ int proc_numa_init(void)
         return -1;
     }
     /* First determine maximum number of nodes */
-    numa_info.numberOfNodes = setConfiguredNodes()+1;
+    //numa_info.numberOfNodes = setConfiguredNodes()+1;
+    numa_info.numberOfNodes= get_numaNodes(NULL, 10000);
+    int* nodes = malloc(numa_info.numberOfNodes*sizeof(int));
+    if (!nodes)
+    {
+	return -ENOMEM;
+    }
+    numa_info.numberOfNodes = get_numaNodes(nodes, numa_info.numberOfNodes);
     numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
     if (!numa_info.nodes)
     {
@@ -314,16 +369,17 @@ int proc_numa_init(void)
 
     for (i=0; i<numa_info.numberOfNodes; i++)
     {
-        numa_info.nodes[i].id = i;
-        nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
-        numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
+	int id = nodes[i];
+        numa_info.nodes[i].id = id;
+        nodeMeminfo(id, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
+        numa_info.nodes[i].numberOfProcessors = nodeProcessorList(id, &numa_info.nodes[i].processors);
         nrCPUs += numa_info.nodes[i].numberOfProcessors;
-        if (numa_info.nodes[i].numberOfProcessors == 0 && nrCPUs != cpuid_topology.activeHWThreads)
+        if (numa_info.nodes[i].numberOfProcessors == 0 && nrCPUs != cpuid_topology.numHWThreads)
         {
             err = -EFAULT;
             break;
         }
-        numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+        numa_info.nodes[i].numberOfDistances = nodeDistanceList(id, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
         if (numa_info.nodes[i].numberOfDistances == 0)
         {
             err = -EFAULT;
@@ -332,8 +388,9 @@ int proc_numa_init(void)
     }
     for (; i<numa_info.numberOfNodes; i++)
     {
+	int id = nodes[i];
         numa_info.nodes[i].numberOfProcessors = 0;
-        numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+        numa_info.nodes[i].numberOfDistances = nodeDistanceList(id, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
     }
     if (err == 0)
         numaInitialized = 1;
@@ -369,6 +426,35 @@ proc_numa_setInterleaved(const int* processorList, int numberOfProcessors)
     }
 }
 
+void
+proc_numa_setMembind(const int* processorList, int numberOfProcessors)
+{
+    /* One machine word serves as node mask; 65 is handed to set_mempolicy()
+     * as maxnode. NOTE(review): bits are indexed by the position in
+     * numa_info.nodes, not by nodes[].id - confirm this is intended. */
+    unsigned long nodemask = 0UL;
+    unsigned long maxnode = 65;
+    long node;
+    int p;
+
+    for (node = 0; node < numa_info.numberOfNodes; node++)
+    {
+        /* Mark the node once proc_findProcessor() matches a listed CPU */
+        for (p = 0; p < numberOfProcessors; p++)
+        {
+            if (proc_findProcessor(node, processorList[p]))
+            {
+                nodemask |= (1UL << node);
+                break;
+            }
+        }
+    }
+    if (set_mempolicy(MPOL_BIND, &nodemask, maxnode) < 0)
+    {
+        ERROR;
+    }
+}
+
 void
 proc_numa_membind(void* ptr, size_t size, int domainId)
 {
diff --git a/src/pci_hwloc.c b/src/pci_hwloc.c
index 4abb9f8ab..17ad82e8a 100644
--- a/src/pci_hwloc.c
+++ b/src/pci_hwloc.c
@@ -60,7 +60,11 @@ hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
     if (!hwloc_topology)
     {
         likwid_hwloc_topology_init(&hwloc_topology);
+#if HWLOC_API_VERSION > 0x00020000
+        likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM );
+#else
         likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM|HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+#endif
         likwid_hwloc_topology_load(hwloc_topology);
     }
 
diff --git a/src/perfgroup.c b/src/perfgroup.c
index 17e1078bb..7bc5b331c 100644
--- a/src/perfgroup.c
+++ b/src/perfgroup.c
@@ -44,34 +44,13 @@
 
 #include <error.h>
 #include <perfgroup.h>
+#include <topology.h>
 #include <likwid.h>
 
-#include <lua.h>
-#include <lauxlib.h>
-#include <lualib.h>
-#include <math.h>
+#include <calculator.h>
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
 
-#define LUA_STATES_CLEAN_DEFAULT 100
-
-char* in_func_str = "require('math');function SUM(...);local s = 0;for k,v in pairs({...}) do;s = s + v;end;return s;end;function ARGS(...) return #arg end;function AVG(...) return SUM(...)/ARGS(...) end;MEAN = AVG;function MIN(...) return math.min(...) end;function MAX(...) return math.max(...) end;function MEDIAN(...);local x = {...};local l = ARGS(...);table.sort(x);return x[math.floor(l/2)];end;function PERCENTILE(perc, ...);local x = {...};local xlen = #x;table.sort(x);local idx = math.ceil((perc/100.0)*xlen);return x[idx];end;function IFELSE(cond, valid, invalid);if cond then;return valid;else;return invalid;end;end";
-char* in_expand_str = "function expand_str(x); hist = {}; runs = 0; for pref, elem in x:gmatch('([%a%d]*)%[([%d,]+)%]') do;  runs = runs + 1;  matches = {};  for num in elem:gmatch('%d+') do;   for _,v in pairs(varlist) do; if v:match(string.format('.*%s%s.*', pref, num)) then;  if not hist[v] then;   hist[v] = 1;  else;   hist[v] = hist[v] + 1; end; end; end;  end; end; res = {}; for k,v in pairs(hist) do;  if v == runs then table.insert(res, k) end; end; return table.concat(res, ',');end;;function expand_wildcard(x); cond = {}; newx = x:gsub('%*','[%%d]'); for _,v in pairs(varlist) do;  if v:match(newx) then;   table.insert(cond, v);  end; end; return table.concat(cond, ',');end;;function eval_str(s); repl = {}; for x in s:gmatch('([%a%d_]+%[[%d,]+%])') do;  repl[x:gsub('%[', '%%['):gsub('%]', '%%]')] = expand_str(x); end; for x in s:gmatch('[%a%d*%._]+') do;  if x:match('*') then;t = expand_wildcard(x); if t:len > 0 then repl[x..'*'] = t end; end; end; for k,v in pairs(repl) do;  s = s:gsub(k..'[%a%d_]*',v); end; return s;end;";
-char* in_user_func_str = NULL;
-
-char* not_allowed[] = {"io.", "popen(", "load", "get", "set", "call(", "require", "module", NULL};
-
-// Keep the Lua states per cpu
-lua_State** lua_states = NULL;
-// A clean counter is needed per cpu to close Lua state from time
-// to time since it grows with each calculation and in some cases,
-// like monitoring, the state would increase memory usage.
-int *lua_states_clean = NULL;
-pthread_mutex_t* lua_states_locks = NULL;
-int num_states = 0;
-char** defines = NULL;
-bstring *bdefines = NULL;
-int* num_defines = NULL;
-bstring bglob_defines;
-bstring bglob_defines_list;
 
 
 static int totalgroups = 0;
@@ -107,7 +86,7 @@ isdir(char* dirname)
 }
 
 void
-return_groups(int groups, char** groupnames, char** groupshort, char** grouplong)
+perfgroup_returnGroups(int groups, char** groupnames, char** groupshort, char** grouplong)
 {
     int i;
     int freegroups = (totalgroups < groups ? groups : totalgroups);
@@ -148,7 +127,7 @@ return_groups(int groups, char** groupnames, char** groupshort, char** grouplong
 
 
 int
-get_groups(
+perfgroup_getGroups(
         const char* grouppath,
         const char* architecture,
         char*** groupnames,
@@ -168,9 +147,12 @@ get_groups(
     bstring SHORT = bformat("SHORT");
     bstring LONG = bformat("LONG");
     bstring REQUIRE = bformat("REQUIRE_NOHT");
+    char* Home = getenv("HOME");
+
     int read_long = 0;
-    if ((grouppath == NULL)||(architecture == NULL)||(groupnames == NULL))
+    if ((grouppath == NULL)||(architecture == NULL)||(groupnames == NULL)||(Home == NULL))
         return -EINVAL;
+
     char* fullpath = malloc((strlen(grouppath)+strlen(architecture)+50) * sizeof(char));
     if (fullpath == NULL)
     {
@@ -179,7 +161,7 @@ get_groups(
         bdestroy(REQUIRE);
         return -ENOMEM;
     }
-    char* homepath = malloc((strlen(getenv("HOME"))+strlen(architecture)+50) * sizeof(char));
+    char* homepath = malloc((strlen(Home)+strlen(architecture)+50) * sizeof(char));
     if (homepath == NULL)
     {
         free(fullpath);
@@ -225,7 +207,7 @@ get_groups(
         }
     }
     closedir(dp);
-    hsize = sprintf(homepath, "%s/.likwid/groups/%s", getenv("HOME"), architecture);
+    hsize = sprintf(homepath, "%s/.likwid/groups/%s", Home, architecture);
     if (isdir(homepath))
     {
         search_home = 1;
@@ -378,7 +360,7 @@ get_groups(
                             free(fullpath);
                             bdestroy(long_info);
                             bstrListDestroy(linelist);
-                            return_groups(i, *groupnames, *groupshort, *grouplong);
+                            perfgroup_returnGroups(i, *groupnames, *groupshort, *grouplong);
                             return -ENOMEM;
                         }
                         s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
@@ -443,7 +425,7 @@ get_groups(
     if (!search_home)
     {
         if (i==0)
-            return_groups(totalgroups, *groupnames, *groupshort, *grouplong);
+            perfgroup_returnGroups(totalgroups, *groupnames, *groupshort, *grouplong);
         /*else if (i < totalgroups)
         {
             for (s=i;s<totalgroups;s++)
@@ -524,7 +506,7 @@ get_groups(
                                 free(fullpath);
                                 bstrListDestroy(linelist);
                                 bdestroy(long_info);
-                                return_groups(i, *groupnames, *groupshort, *grouplong);
+                                perfgroup_returnGroups(i, *groupnames, *groupshort, *grouplong);
                                 return -ENOMEM;
                             }
                             s = sprintf((*groupshort)[i], "%s", bdata(sinfo));
@@ -586,7 +568,7 @@ get_groups(
         closedir(dp);
     }
     if (i==0)
-        return_groups(totalgroups, *groupnames, *groupshort, *grouplong);
+        perfgroup_returnGroups(totalgroups, *groupnames, *groupshort, *grouplong);
 /*    else if (i < totalgroups)
     {
         for (s=i;s<totalgroups;s++)
@@ -607,7 +589,7 @@ get_groups(
 
 
 
-int custom_group(const char* eventStr, GroupInfo* ginfo)
+int perfgroup_customGroup(const char* eventStr, GroupInfo* ginfo)
 {
     int i, j;
     int err = 0;
@@ -627,10 +609,18 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
     ginfo->longinfo = NULL;
     bstring eventBstr;
     struct bstrList * eventList;
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
     bstring fix0 = bformat("FIXC0");
     bstring fix1 = bformat("FIXC1");
     bstring fix2 = bformat("FIXC2");
+#endif
+#ifdef _ARCH_PPC
+    bstring fix0 = bformat("PMC4");
+    bstring fix1 = bformat("PMC5");
+#endif
+#ifdef LIKWID_WITH_NVMON
     bstring gpu = bformat("GPU");
+#endif
     DEBUG_PRINT(DEBUGLEV_INFO, Creating custom group for event string %s, eventStr);
     ginfo->shortinfo = malloc(7 * sizeof(char));
     if (ginfo->shortinfo == NULL)
@@ -656,7 +646,7 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
     eventBstr = bfromcstr(eventStr);
     eventList = bsplit(eventBstr, delim);
     ginfo->nevents = eventList->qty;
-    if (cpuid_info.isIntel)
+    if (cpuid_info.isIntel || cpuid_info.family == PPC_FAMILY)
     {
         if (binstr(eventBstr, 0, fix0) > 0)
         {
@@ -674,6 +664,7 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
         {
             ginfo->nevents++;
         }
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
         if (binstr(eventBstr, 0, fix2) > 0)
         {
             has_fix2 = 1;
@@ -682,6 +673,7 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
         {
             ginfo->nevents++;
         }
+#endif
     }
     bdestroy(eventBstr);
 
@@ -730,14 +722,17 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
         }
         sprintf(ginfo->events[i], "%s", bdata(elist->entry[0]));
         snprintf(ginfo->counters[i], blength(ctr)+1, "%s", bdata(ctr));
+#ifdef LIKWID_WITH_NVMON
         if (binstr(elist->entry[1], 0, gpu) != BSTR_ERR)
         {
             gpu_events++;
         }
+#endif
         bdestroy(ctr);
         bstrListDestroy(elist);
     }
     i = eventList->qty;
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
     if (cpuid_info.isIntel && i != gpu_events)
     {
         if ((!has_fix0) && cpuid_info.perf_num_fixed_ctr > 0)
@@ -766,19 +761,43 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
         }
     }
     ginfo->nevents = i;
+#endif
+#ifdef _ARCH_PPC
+    if (!has_fix0)
+    {
+        ginfo->events[i] = malloc(18 * sizeof(char));
+        ginfo->counters[i] = malloc(6 * sizeof(char));
+        sprintf(ginfo->events[i], "%s", "PM_RUN_INST_CMPL");
+        sprintf(ginfo->counters[i], "%s", "PMC4");
+        i++;
+    }
+    if (!has_fix1)
+    {
+        ginfo->events[i] = malloc(22 * sizeof(char));
+        ginfo->counters[i] = malloc(6 * sizeof(char));
+        sprintf(ginfo->events[i], "%s", "PM_RUN_CYC");
+        sprintf(ginfo->counters[i], "%s", "PMC5");
+        i++;
+    }
+#endif
     bstrListDestroy(eventList);
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
     bdestroy(fix0);
     bdestroy(fix1);
     bdestroy(fix2);
-    bdestroy(gpu);
+#endif
     bdestroy(edelim);
     return 0;
 cleanup:
     bstrListDestroy(eventList);
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
     bdestroy(fix0);
     bdestroy(fix1);
     bdestroy(fix2);
+#endif
+#ifdef LIKWID_WITH_NVMON
     bdestroy(gpu);
+#endif
     bdestroy(edelim);
     if (ginfo->shortinfo != NULL)
         free(ginfo->shortinfo);
@@ -790,7 +809,7 @@ int custom_group(const char* eventStr, GroupInfo* ginfo)
 }
 
 int
-read_group(
+perfgroup_readGroup(
         const char* grouppath,
         const char* architecture,
         const char* groupname,
@@ -801,11 +820,12 @@ read_group(
     char buf[1024];
     GroupFileSections sec = GROUP_NONE;
     bstring REQUIRE = bformat("REQUIRE_NOHT");
-    if ((grouppath == NULL)||(architecture == NULL)||(groupname == NULL)||(ginfo == NULL))
+    char* Home = getenv("HOME");
+    if ((grouppath == NULL)||(architecture == NULL)||(groupname == NULL)||(ginfo == NULL)||(Home == NULL))
         return -EINVAL;
 
     bstring fullpath = bformat("%s/%s/%s.txt", grouppath,architecture, groupname);
-    bstring homepath = bformat("%s/.likwid/groups/%s/%s.txt", getenv("HOME"),architecture, groupname);
+    bstring homepath = bformat("%s/.likwid/groups/%s/%s.txt", Home,architecture, groupname);
 
     if (access(bdata(fullpath), R_OK))
     {
@@ -835,7 +855,6 @@ read_group(
     ginfo->metricformulas = NULL;
     ginfo->metricnames = NULL;
     ginfo->longinfo = NULL;
-    ginfo->lua_funcs = NULL;
     ginfo->groupname = (char*)malloc((strlen(groupname)+10)*sizeof(char));
     if (ginfo->groupname == NULL)
     {
@@ -1111,24 +1130,6 @@ read_group(
             sprintf(&(ginfo->longinfo[s]), "%.*s", (int)strlen(buf), buf);
             continue;
         }
-        else if (sec == GROUP_LUA)
-        {
-            s = (ginfo->lua_funcs == NULL ? 0 : strlen(ginfo->lua_funcs));
-            char *tmp;
-            tmp = realloc(ginfo->lua_funcs, (s + strlen(buf) + 3) * sizeof(char));
-            if (tmp == NULL)
-            {
-                free(ginfo->lua_funcs);
-                err = -ENOMEM;
-                goto cleanup;
-            }
-            else
-            {
-                ginfo->lua_funcs = tmp;
-            }
-            sprintf(&(ginfo->lua_funcs[s]), "%.*s", (int)strlen(buf), buf);
-            continue;
-        }
     }
     //bstrListDestroy(linelist);
     fclose(fp);
@@ -1146,8 +1147,6 @@ read_group(
         free(ginfo->shortinfo);
     if (ginfo->longinfo)
         free(ginfo->longinfo);
-    if (ginfo->lua_funcs)
-        free(ginfo->lua_funcs);
     if (ginfo->nevents > 0)
     {
         for(i=0;i<ginfo->nevents; i++)
@@ -1181,7 +1180,7 @@ read_group(
 }
 
 int
-new_group(GroupInfo* ginfo)
+perfgroup_new(GroupInfo* ginfo)
 {
     if (!ginfo)
         return -EINVAL;
@@ -1198,7 +1197,7 @@ new_group(GroupInfo* ginfo)
 }
 
 char*
-get_eventStr(GroupInfo* ginfo)
+perfgroup_getEventStr(GroupInfo* ginfo)
 {
     int i;
     char* string;
@@ -1227,7 +1226,7 @@ get_eventStr(GroupInfo* ginfo)
 }
 
 void
-put_eventStr(char* eventset)
+perfgroup_returnEventStr(char* eventset)
 {
     if (eventset != NULL)
     {
@@ -1237,7 +1236,7 @@ put_eventStr(char* eventset)
 }
 
 int
-add_event(GroupInfo* ginfo, char* event, char* counter)
+perfgroup_addEvent(GroupInfo* ginfo, char* counter, char* event)
 {
     if ((!ginfo) || (!event) || (!counter))
         return -EINVAL;
@@ -1259,32 +1258,63 @@ add_event(GroupInfo* ginfo, char* event, char* counter)
     return 0;
 }
 
+void perfgroup_removeEvent(GroupInfo* ginfo, char* counter)
+{
+    fputs("perfgroup_removeEvent not implemented\n", stderr); /* stub: arguments unused */
+}
+
 int
-add_metric(GroupInfo* ginfo, char* mname, char* mcalc)
+perfgroup_addMetric(GroupInfo* ginfo, char* mname, char* mcalc) /* appends a copy of mname/mcalc; returns 0, -EINVAL or -ENOMEM */
 {
     if ((!ginfo) || (!mname) || (!mcalc))
         return -EINVAL;
     ginfo->metricnames = realloc(ginfo->metricnames, (ginfo->nmetrics + 1) * sizeof(char*));
     if (!ginfo->metricnames)
+    {
+        ERROR_PRINT(Cannot increase space for metricnames to %zu bytes, (ginfo->nmetrics + 1) * sizeof(char*));
         return -ENOMEM;
+    }
     ginfo->metricformulas = realloc(ginfo->metricformulas, (ginfo->nmetrics + 1) * sizeof(char*));
     if (!ginfo->metricformulas)
+    {
+        ERROR_PRINT(Cannot increase space for metricformulas to %zu bytes, (ginfo->nmetrics + 1) * sizeof(char*));
         return -ENOMEM;
+    }
     ginfo->metricnames[ginfo->nmetrics] = malloc((strlen(mname) + 1) * sizeof(char));
     if (!ginfo->metricnames[ginfo->nmetrics])
+    {
+        ERROR_PRINT(Cannot increase space for metricname to %zu bytes, (strlen(mname) + 1) * sizeof(char));
         return -ENOMEM;
+    }
     ginfo->metricformulas[ginfo->nmetrics] = malloc((strlen(mcalc) + 1) * sizeof(char));
     if (!ginfo->metricformulas[ginfo->nmetrics])
+    {
+        ERROR_PRINT(Cannot increase space for metricformula to %zu bytes, (strlen(mcalc) + 1) * sizeof(char));
         return -ENOMEM;
-    sprintf(ginfo->metricnames[ginfo->nmetrics], "%s", mname);
-    sprintf(ginfo->metricformulas[ginfo->nmetrics], "%s", mcalc);
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Adding metric %s = %s, mname, mcalc);
+    int ret = sprintf(ginfo->metricnames[ginfo->nmetrics], "%s", mname);
+    if (ret > 0) /* sprintf already wrote the terminator; it is set again explicitly */
+    {
+        ginfo->metricnames[ginfo->nmetrics][ret] = '\0';
+    }
+    ret = sprintf(ginfo->metricformulas[ginfo->nmetrics], "%s", mcalc);
+    if (ret > 0) /* same explicit termination for the formula string */
+    {
+        ginfo->metricformulas[ginfo->nmetrics][ret] = '\0';
+    }
     ginfo->nmetrics++;
     return 0;
 }
 
+void perfgroup_removeMetric(GroupInfo* ginfo, char* mname) /* stub: arguments unused */
+{
+    fprintf(stderr, "perfgroup_removeMetric not implemented\n");
+}
+
 
 char*
-get_groupName(GroupInfo* ginfo)
+perfgroup_getGroupName(GroupInfo* ginfo)
 {
     if ((ginfo != NULL) && (ginfo->groupname != NULL))
     {
@@ -1296,21 +1326,39 @@ get_groupName(GroupInfo* ginfo)
     return NULL;
 }
 
+/* Free a group name string; a NULL pointer is accepted and ignored. */
+void
+perfgroup_returnGroupName(char* gname)
+{
+    /* free() itself ignores NULL. gname is passed by value, so resetting
+     * it here would not be visible to the caller. */
+    free(gname);
+    return;
+}
+
 int
-set_groupName(GroupInfo* ginfo, char* groupName)
+perfgroup_setGroupName(GroupInfo* ginfo, char* groupName) /* stores a copy of groupName; returns 0, -EINVAL or -ENOMEM */
 {
     if ((ginfo == NULL) || (groupName == NULL))
         return -EINVAL;
     int size = strlen(groupName)+1;
     ginfo->groupname = realloc(ginfo->groupname, size * sizeof(char));
     if (ginfo->groupname == NULL)
+    {
+        ERROR_PRINT(Cannot increase space for groupname to %zu bytes, size * sizeof(char));
         return -ENOMEM;
-    sprintf(ginfo->groupname, "%s", groupName);
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, Setting group name to %s, groupName);
+    int ret = sprintf(ginfo->groupname, "%s", groupName);
+    if (ret > 0) /* sprintf already wrote the terminator; it is set again explicitly */
+    {
+        ginfo->groupname[ret] = '\0';
+    }
     return 0;
 }
 
 char*
-get_shortInfo(GroupInfo* ginfo)
+perfgroup_getShortInfo(GroupInfo* ginfo)
 {
     if ((ginfo != NULL) && (ginfo->shortinfo != NULL))
     {
@@ -1323,7 +1371,7 @@ get_shortInfo(GroupInfo* ginfo)
 }
 
 void
-put_shortInfo(char* sinfo)
+perfgroup_returnShortInfo(char* sinfo)
 {
     if (sinfo != NULL)
     {
@@ -1333,7 +1381,7 @@ put_shortInfo(char* sinfo)
 }
 
 int
-set_shortInfo(GroupInfo* ginfo, char* shortInfo)
+perfgroup_setShortInfo(GroupInfo* ginfo, char* shortInfo)
 {
     if ((ginfo == NULL) || (shortInfo == NULL))
         return -EINVAL;
@@ -1346,7 +1394,7 @@ set_shortInfo(GroupInfo* ginfo, char* shortInfo)
 }
 
 char*
-get_longInfo(GroupInfo* ginfo)
+perfgroup_getLongInfo(GroupInfo* ginfo)
 {
     if ((ginfo != NULL) && (ginfo->longinfo != NULL))
     {
@@ -1359,7 +1407,7 @@ get_longInfo(GroupInfo* ginfo)
 }
 
 void
-put_longInfo(char* linfo)
+perfgroup_returnLongInfo(char* linfo)
 {
     if (linfo != NULL)
     {
@@ -1369,7 +1417,7 @@ put_longInfo(char* linfo)
 }
 
 int
-set_longInfo(GroupInfo* ginfo, char* longInfo)
+perfgroup_setLongInfo(GroupInfo* ginfo, char* longInfo)
 {
     if ((ginfo == NULL) || (longInfo == NULL))
         return -EINVAL;
@@ -1382,7 +1430,7 @@ set_longInfo(GroupInfo* ginfo, char* longInfo)
 }
 
 void
-return_group(GroupInfo* ginfo)
+perfgroup_returnGroup(GroupInfo* ginfo)
 {
     int i;
     if (ginfo->groupname)
@@ -1427,391 +1475,109 @@ return_group(GroupInfo* ginfo)
 }
 
 
-int calc_add_to_varlist(char* name, bstring bvarlist)
-{
-    int ret = 0;
-    if (!name)
-        return -EINVAL;
-    bstring bname = bformat("\"%s\"", name);
-    if (blength(bvarlist) > 0)
-        bcatcstr(bvarlist, ",");
-    ret = bconcat(bvarlist, bname);
-    bdestroy(bname);
-    return ret;
-}
-
-int calc_add_str_var(char* name, char* value, bstring vars, bstring varlist)
-{
-    int ret = 0;
-    bstring add = bformat("%s = %s\n", name, value);
-    ret = bconcat(vars, add);
-    if (ret == BSTR_OK)
-        ret = calc_add_to_varlist(name, varlist);
-    bdestroy(add);
-    return ret;
-}
-
-int calc_add_dbl_var(char* name, double value, bstring bvars, bstring varlist)
-{
-    int ret = 0;
-    bstring add = bformat("%s = %20.20f\n", name, value);
-    ret = bconcat(bvars, add);
-    if (ret == BSTR_OK)
-        ret = calc_add_to_varlist(name, varlist);
-    bdestroy(add);
-    return ret;
-}
-
-int calc_add_int_var(char* name, int value, bstring bvars, bstring varlist)
+int perfgroup_mergeGroups(GroupInfo* grp1, GroupInfo* grp2) /* stub: neither group is read or modified */
 {
-    int ret = 0;
-    bstring add = bformat("%s = %d\n", name, value);
-    ret = bconcat(bvars, add);
-    if (ret == BSTR_OK)
-        ret = calc_add_to_varlist(name, varlist);
-    bdestroy(add);
-    return ret;
+    fprintf(stderr, "perfgroup_mergeGroups not implemented\n");
+    return -1; /* unconditional failure code while merging is not implemented */
 }
 
-static int _calc_add_def(bstring add, int cpu)
-{
-    int ret = 0;
-    if (cpu < 0)
-    {
-        ret = bconcat(bglob_defines, add);
-    }
-    else
-    {
-        ret = bconcat(bdefines[cpu], add);
-    }
-    return ret;
-}
 
-// cpu == -1 means global definition
-int calc_add_dbl_def(char* name, double value, int cpu)
+void
+init_clist(CounterList* clist) /* resets clist to an empty state; clist itself is dereferenced without a NULL check */
 {
-    int ret = 0;
-    bstring add = bformat("%s = %20.20f\n", name, value);
-    ret = _calc_add_def(add, cpu);
-    bdestroy(add);
-    if (!ret)
-        ret = calc_add_to_varlist(name, bglob_defines_list);
-    return ret;
+    clist->counters = 0; /* number of name/value pairs currently stored */
+    clist->cnames = bstrListCreate(); /* list of counter names; result is not checked here */
+    clist->cvalues = bstrListCreate(); /* list of formatted values, indexed in parallel to cnames */
 }
 
-// cpu == -1 means global definition
-int calc_add_int_def(char* name, int value, int cpu)
+int
+add_to_clist(CounterList* clist, char* counter, double result) /* appends the pair (counter, result) to clist; always returns 0 */
 {
-    int ret = 0;
-    bstring add = bformat("%s = %d\n", name, value);
-    ret = _calc_add_def(add, cpu);
-    bdestroy(add);
-    if (!ret)
-        ret = calc_add_to_varlist(name, bglob_defines_list);
-    return ret;
-}
 
-// cpu == -1 means global definition
-int calc_add_str_def(char* name, char* value, int cpu)
-{
-    int ret = 0;
-    bstring add = bformat("%s = %s\n", name, value);
-    ret = _calc_add_def(add, cpu);
-    bdestroy(add);
-    if (!ret)
-        ret = calc_add_to_varlist(name, bglob_defines_list);
-    return ret;
+    bstrListAddChar(clist->cnames, counter); /* clist and counter are used without NULL checks */
+    bstring v = bformat("%.20f", result); /* value is stored as text with 20 digits after the decimal point */
+    bstrListAdd(clist->cvalues, v); /* return value is ignored */
+    clist->counters++;
+    bdestroy(v); /* NOTE(review): presumably bstrListAdd keeps its own copy of v - confirm */
+    return 0;
 }
 
-
-int calc_set_user_funcs(char* s)
+int
+update_clist(CounterList* clist, char* counter, double result) /* overwrites the stored value of counter; returns 0, -EINVAL or -ENOENT */
 {
-    if (!s)
+    int i;
+    int found = 0;
+    if ((clist == NULL)||(counter == NULL))
         return -EINVAL;
-    if (in_user_func_str)
-        free(in_user_func_str);
-
-    int i = 0;
-    while (not_allowed[i])
+    bstring c = bfromcstr(counter); /* temporary bstring used only for the name comparison below */
+    for (i=0; i< clist->counters; i++)
     {
-        char* p = strstr(s, not_allowed[i]);
-        if (p)
+        bstring comp = bstrListGet(clist->cnames, i);
+        if (bstrcmp(comp, c) == BSTR_OK) /* exact name match */
         {
-            fprintf(stderr, "ERROR: User function string contains invalid commands\n");
-            return -EINVAL;
+            bstring v = bformat("%.20f", result);
+            bstring val = bstrListGet(clist->cvalues, i); /* NOTE(review): presumably the stored entry itself, not a copy - confirm */
+            btrunc(val, 0); /* empty the old text ... */
+            bconcat(val, v); /* ... and write the newly formatted value in its place */
+            bdestroy(v);
+            found = 1;
+            break; /* only the first matching entry is updated */
         }
-        i++;
     }
-
-    // test user given functions
-    lua_State *L = luaL_newstate();
-    luaL_openlibs(L);
-    in_user_func_str = NULL;
-    int ret = luaL_dostring (L, s);
-    lua_close(L);
-    if (ret)
-    {
-        fprintf(stderr, "WARN: Defined functions not valid Lua\n");
-        return 1;
-    }
-    else
+    bdestroy(c);
+    if (!found)
     {
-        ret = asprintf(&in_user_func_str, "%s", s);
-        if (ret < 0)
-            return ret;
+        return -ENOENT; /* no entry with that name exists in clist */
     }
     return 0;
 }
 
-static double do_calc(int cpu, char* s, bstring vars)
-{
-    double res = NAN;
-    int ret = 0;
-    char* t = NULL;
-    lua_State *L = lua_states[cpu];
-    // Allocate a new Lua state for the cpu
-    if (lua_states && !L)
-    {
-        pthread_mutex_lock(&lua_states_locks[cpu]);
-        L = luaL_newstate();
-        luaL_openlibs(L);
-        lua_states[cpu] = L;
-        lua_states_clean[cpu] = LUA_STATES_CLEAN_DEFAULT;
-        pthread_mutex_unlock(&lua_states_locks[cpu]);
-    }
-    bstring scratch = bfromcstr(in_func_str);
-    bcatcstr(scratch, "\n");
-    if (blength(bglob_defines) > 0)
-    {
-        bconcat(scratch, bglob_defines);
-    }
-    if (bdefines[cpu])
-    {
-        bconcat(scratch, bdefines[cpu]);
-        bcatcstr(scratch, "\n");
-    }
-    if (in_user_func_str)
-    {
-        bcatcstr(scratch, in_user_func_str);
-        bcatcstr(scratch, "\n");
-    }
-    if (blength(vars) > 0)
-    {
-        bconcat(scratch, vars);
-        bcatcstr(scratch, "\n");
-    }
-
-    bcatcstr(scratch, "return ");
-    bcatcstr(scratch, s);
-    bcatcstr(scratch, "\n");
-
-    ret = luaL_dostring (L, bdata(scratch));
-    if (!ret)
-    {
-        if (strncmp(luaL_typename(L, -1), "number", 6) == 0)
-            res = lua_tonumber(L, -1);
-    }
-    bdestroy(scratch);
-
-    // decrement clean counter for cpu and close the Lua state if zero
-    pthread_mutex_lock(&lua_states_locks[cpu]);
-    lua_states_clean[cpu]--;
-    if (lua_states && lua_states[cpu] && lua_states_clean[cpu] == 0 )
-    {
-        lua_close(lua_states[cpu]);
-        lua_states[cpu] = NULL;
-    }
-    pthread_mutex_unlock(&lua_states_locks[cpu]);
-    return res;
-}
-
-static char* do_expand(int cpu, char* f, bstring varlist)
+void
+destroy_clist(CounterList* clist)
 {
-    int ret = 0;
-    char* out = NULL;
-    char* t = NULL;
-
-    // Allocate a new Lua state for the cpu
-    lua_State *L = lua_states[cpu];
-    if (lua_states && !L)
-    {
-        pthread_mutex_lock(&lua_states_locks[cpu]);
-        L = luaL_newstate();
-        luaL_openlibs(L);
-        lua_states[cpu] = L;
-        lua_states_clean[cpu] = LUA_STATES_CLEAN_DEFAULT;
-        pthread_mutex_unlock(&lua_states_locks[cpu]);
-    }
-    bstring scratch = bformat("varlist={%s,%s}\n%s\nreturn eval_str(\"%s\")", bdata(bglob_defines_list), bdata(varlist), in_expand_str, f);
-    if (ret < 0)
-    {
-        return NULL;
-    }
-    ret = luaL_dostring (L, bdata(scratch));
-    if (!ret)
-    {
-        out = (char*)lua_tostring(L, -1);
-    }
-    bdestroy(scratch);
-
-    // decrement clean counter for cpu and close the Lua state of the cpu if zero
-    pthread_mutex_lock(&lua_states_locks[cpu]);
-    lua_states_clean[cpu]--;
-    if (lua_states && lua_states[cpu] && lua_states_clean[cpu] == 0 )
+    /* Destroys both lists of clist and resets its counter; a NULL clist is ignored. */
+    if (clist != NULL)
     {
-        lua_close(lua_states[cpu]);
-        lua_states[cpu] = NULL;
+        bstrListDestroy(clist->cnames); /* the cnames/cvalues members are not reset afterwards */
+        bstrListDestroy(clist->cvalues);
+        clist->counters = 0;
     }
-    pthread_mutex_unlock(&lua_states_locks[cpu]);
-    return out;
 }
 
-
-
 int
-calc_metric(int cpu, char* formula, bstring vars, bstring varlist, double *result)
+calc_metric(char* formula, CounterList* clist, double *result) /* substitutes clist values into formula, evaluates it into *result; returns -EINVAL on a NULL argument, else calculate_infix()'s return value */
 {
     int i=0;
-    char* f;
+    if (result != NULL) *result = 0.0; /* default output; guarded so a NULL result is rejected below instead of dereferenced */
     int maxstrlen = 0, minstrlen = 10000;
-    char buf[128];
 
-    if ((formula == NULL) ||
-        (result == NULL) ||
-        (cpu < 0) ||
-        (cpu > cpuid_topology.numHWThreads) ||
-        (vars == NULL) ||
-        (varlist == NULL))
+    if ((formula == NULL) || (clist == NULL) || (result == NULL))
         return -EINVAL;
-    if (strlen(formula) == 0 || blength(vars) == 0 || blength(varlist) == 0)
-        return -EINVAL;
-    *result = NAN;
-
-/*    if (strchr(formula, '[') != NULL)*/
-/*    {*/
-/*        f = do_expand(cpu, formula, varlist);*/
-/*        if (f)*/
-/*        {*/
-/*            *result = do_calc(cpu, formula, vars);*/
-/*            return 0;*/
-/*        }*/
-/*    }*/
-/*    else*/
-/*    {*/
-/*        *result = do_calc(cpu, formula, vars);*/
-/*        return 0;*/
-/*    }*/
-/*    return 1;*/
-    *result = do_calc(cpu, formula, vars);
-    return 0;
-}
-
-void __attribute__((constructor (103))) init_perfgroup(void)
-{
-    int ret = 0;
-    topology_init();
-    CpuTopology_t cputopo = get_cpuTopology();
-    CpuInfo_t cpuinfo = get_cpuInfo();
-    int cpus = cputopo->numHWThreads;
-    lua_states = (lua_State**)malloc(cpus * sizeof(lua_State*));
-    if (lua_states)
-    {
-        //memset(lua_states, 0, cpus * sizeof(lua_State*));
-        for (int i = 0; i < cpus; i++)
-        {
-            lua_states[i] = NULL;
-        }
-    }
-    lua_states_clean = (int*)malloc(cpus * sizeof(int));
-    if (lua_states_clean)
-    {
-        memset(lua_states_clean, 0, cpus * sizeof(int));
-    }
-    lua_states_locks = (pthread_mutex_t*)malloc(cpus * sizeof(pthread_mutex_t));
-    if (lua_states_locks)
-    {
-        for (int i = 0; i < cpus; i++)
-        {
-            pthread_mutex_init(&lua_states_locks[i], NULL);
-        }
-    }
-    num_states = cpus;
-    bdefines = (bstring*)malloc(cpus * sizeof(bstring));
-    if (bdefines)
-    {
-        memset(bdefines, 0, cpus * sizeof(bdefines));
-        for (int i = 0; i < cpus; i++)
-        {
-            bdefines[i] = bformat("");
-            calc_add_int_def("CPUID", cputopo->threadPool[i].apicId, cputopo->threadPool[i].apicId);
-        }
-    }
-    num_defines = (int*)malloc(cpus * sizeof(int));
-    if (num_defines)
-    {
-        memset(num_defines, 0, cpus * sizeof(int));
-    }
-    bglob_defines = bformat("");
-    bglob_defines_list = bformat("");
-    calc_add_str_def("TRUE", "true", -1);
-    calc_add_str_def("FALSE", "false", -1);
-    calc_add_int_def("CPU_COUNT", cpus, -1);
-    calc_add_int_def("CPU_ACTIVE", cputopo->activeHWThreads, -1);
-    calc_add_int_def("SOCKET_COUNT", cputopo->numSockets, -1);
-    calc_add_int_def("CORES_PER_SOCKET", cputopo->numCoresPerSocket, -1);
-    calc_add_int_def("CPUS_PER_CORE", cputopo->numThreadsPerCore, -1);
-    for (int i= 0; i < cputopo->numCacheLevels; i++)
-    {
-        char name[100];
-        snprintf(name, 99, "L%d_SIZE", cputopo->cacheLevels[i].level);
-        calc_add_int_def(name, cputopo->cacheLevels[i].size, -1);
-        snprintf(name, 99, "L%d_LINESIZE", cputopo->cacheLevels[i].level);
-        calc_add_int_def(name, cputopo->cacheLevels[i].lineSize, -1);
-    }
-    calc_add_int_def("MEM_LINESIZE", 64, -1);
-    //topology_finalize();
-}
 
-void __attribute__((destructor (103))) close_perfgroup(void)
-{
-    if (lua_states && num_states > 0)
+    bstring f = bfromcstr(formula); /* working copy; the caller's formula string is not modified */
+    for(i=0;i<clist->counters;i++)
     {
-        for (int i = 0; i < num_states; i++)
-        {
-            if (lua_states[i])
-            {
-                lua_close(lua_states[i]);
-                lua_states[i] = NULL;
-            }
-        }
-        free(lua_states);
-        lua_states = NULL;
-    }
-    if (lua_states_clean)
-    {
-        free(lua_states_clean);
-        lua_states_clean = NULL;
-    }
-    if (lua_states_locks)
-    {
-        for (int i = 0; i < num_states; i++)
-        {
-            pthread_mutex_destroy(&lua_states_locks[i]);
-        }
-        free(lua_states_locks);
-        lua_states_locks = NULL;
+        bstring c = bstrListGet(clist->cnames, i);
+        int len = blength(c);
+        maxstrlen = (maxstrlen > len ? maxstrlen : len); /* longest counter name in clist */
+        minstrlen = (minstrlen < len ? minstrlen : len); /* shortest counter name in clist */
     }
 
-    for (int i = 0; i < num_states; i++)
+    // replace counter names by their values, longest names first (presumably so a short name cannot clobber part of a longer one)
+    while (maxstrlen >= minstrlen)
     {
-        if (bdefines[i])
+        for(i=0;i<clist->counters;i++)
         {
-            bdestroy(bdefines[i]);
-            num_defines[i] = 0;
+            bstring c = bstrListGet(clist->cnames, i);
+            if (blength(c) != maxstrlen) /* this pass only handles names of the current length */
+                continue;
+            bstring v = bstrListGet(clist->cvalues, i);
+            bfindreplace(f, c, v, 0); /* search starts at position 0 of the working copy */
         }
+        maxstrlen--;
     }
-    bdestroy(bglob_defines_list);
-    bdestroy(bglob_defines);
-    free(num_defines);
-    num_states = 0;
+    // now we can calculate the formula
+    i = calculate_infix(bdata(f), result); /* i is reused to carry the evaluator's return code */
+    bdestroy(f);
+    return i;
 }
diff --git a/src/perfmon.c b/src/perfmon.c
index 7c124ff95..6344fab84 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -75,6 +75,7 @@
 #include <perfmon_skylake.h>
 #include <perfmon_cascadelake.h>
 #include <perfmon_zen.h>
+#include <perfmon_zen2.h>
 #include <perfmon_a57.h>
 #include <perfmon_a15.h>
 
@@ -82,6 +83,11 @@
 #include <perfmon_perfevent.h>
 #endif
 
+#ifdef _ARCH_PPC
+#include <perfmon_power8.h>
+#include <perfmon_power9.h>
+#endif
+
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 PerfmonEvent* eventHash = NULL;
@@ -543,6 +549,14 @@ parseOptions(struct bstrList* tokens, PerfmonEvent* event, RegisterIndex index)
                                     event->numberOfOptions, EVENT_OPTION_PERF_FLAGS, 0);
             }
 #endif
+            else if (biseqcstr(subtokens->entry[0], "config") == 1)
+            {
+                event->eventId = strtoull(bdata(subtokens->entry[1]), NULL, 16);
+            }
+            else if (biseqcstr(subtokens->entry[0], "umask") == 1)
+            {
+                event->umask = strtoull(bdata(subtokens->entry[1]), NULL, 16);
+            }
             else
             {
                 fprintf(stderr, "WARN: Option '%s' unknown, skipping option\n", bdata(subtokens->entry[0]));
@@ -761,6 +775,22 @@ perfmon_check_counter_map(int cpu_id)
             counter_map[i].type = NOTYPE;
             counter_map[i].optionMask = 0x0ULL;
         }
+#else
+        char* path = translate_types[counter_map[i].type];
+        struct stat st;
+        if (path == NULL || stat(path, &st) != 0)
+        {
+            counter_map[i].type = NOTYPE;
+            counter_map[i].optionMask = 0x0ULL;
+        }
+        if (counter_map[i].type != PMC && counter_map[i].type != FIXED)
+        {
+            if (perfevent_paranoid_value() > 0 && getuid() != 0)
+            {
+                counter_map[i].type = NOTYPE;
+                counter_map[i].optionMask = 0x0ULL;
+            }
+        }
 #endif
     }
     if (own_hpm)
@@ -780,7 +810,7 @@ perfmon_check_counter_map(int cpu_id)
                 continue;
             PerfmonEvent event;
             bstring cstr = bfromcstr(counter_map[j].key);
-            if (getEvent(estr, cstr, &event))
+            if (getEvent(estr, cstr, &event) && checkCounter(cstr, eventHash[i].limit))
             {
                 found = 1;
                 bdestroy(cstr);
@@ -1119,13 +1149,51 @@ perfmon_init_maps(void)
             break;
 
         case ZEN_FAMILY:
-            eventHash = zen_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsZen;
-            counter_map = zen_counter_map;
-            box_map = zen_box_map;
-            perfmon_numCounters = perfmon_numCountersZen;
-            translate_types = zen_translate_types;
+            switch ( cpuid_info.model )
+            {
+                case ZEN_RYZEN:
+                    eventHash = zen_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsZen;
+                    counter_map = zen_counter_map;
+                    box_map = zen_box_map;
+                    perfmon_numCounters = perfmon_numCountersZen;
+                    translate_types = zen_translate_types;
+                    break;
+                case ZEN2_RYZEN:
+                    eventHash = zen2_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsZen2;
+                    counter_map = zen2_counter_map;
+                    box_map = zen2_box_map;
+                    perfmon_numCounters = perfmon_numCountersZen2;
+                    translate_types = zen2_translate_types;
+                    break;
+                default:
+                    ERROR_PLAIN_PRINT(Unsupported AMD Zen Processor);
+            }
             break;
+#ifdef _ARCH_PPC
+	case PPC_FAMILY:
+	    switch ( cpuid_info.model )
+            {
+                case POWER8:
+		    eventHash = power8_arch_events;
+	            counter_map = power8_counter_map;
+	            box_map = power8_box_map;
+                    translate_types = power8_translate_types;
+	            perfmon_numArchEvents = NUM_ARCH_EVENTS_POWER8;
+	            perfmon_numCounters = NUM_COUNTERS_POWER8;
+	            break;
+                case POWER9:
+		    eventHash = power9_arch_events;
+	            counter_map = power9_counter_map;
+	            box_map = power9_box_map;
+                    translate_types = power9_translate_types;
+	            perfmon_numArchEvents = NUM_ARCH_EVENTS_POWER9;
+	            perfmon_numCounters = NUM_COUNTERS_POWER9;
+	            break;
+	    }
+            break;
+#endif
 
         case ARMV7_FAMILY:
             switch ( cpuid_info.model )
@@ -1149,6 +1217,8 @@ perfmon_init_maps(void)
                     switch ( cpuid_info.part )
                     {
                         case ARM_CORTEX_A57:
+                        case ARM_CORTEX_A72:
+                        case ARM_CORTEX_A73:
                             eventHash = a57_arch_events;
                             perfmon_numArchEvents = perfmon_numArchEventsA57;
                             counter_map = a57_counter_map;
@@ -1156,6 +1226,7 @@ perfmon_init_maps(void)
                             perfmon_numCounters = perfmon_numCountersA57;
                             translate_types = a57_translate_types;
                             break;
+                        case ARM_CORTEX_A35:
                         case ARM_CORTEX_A53:
                             eventHash = a57_arch_events;
                             perfmon_numArchEvents = perfmon_numArchEventsA57;
@@ -1208,6 +1279,43 @@ perfmon_init_maps(void)
             ERROR_PLAIN_PRINT(Unsupported Processor);
             break;
     }
+    if (eventHash)
+    {
+        PerfmonEvent* tmp = malloc((perfmon_numArchEvents+10)*sizeof(PerfmonEvent));
+        if (tmp)
+        {
+            memcpy(tmp, eventHash, perfmon_numArchEvents*sizeof(PerfmonEvent));
+            eventHash = tmp;
+            eventHash[perfmon_numArchEvents].name = "GENERIC_EVENT";
+            bstring blim = bfromcstr("PMC");
+            for (int i = 0; i < perfmon_numArchEvents; i++)
+            {
+                bstring x = bfromcstr(eventHash[i].limit);
+                if (binstr(blim, 0, x) == BSTR_ERR)
+                {
+                    bconchar(blim, '|');
+                    bconcat(blim, x);
+                }
+                bdestroy(x);
+            }
+            eventHash[perfmon_numArchEvents].limit = malloc((blength(blim)+2)*sizeof(char));
+            int ret = snprintf(eventHash[perfmon_numArchEvents].limit,
+                               blength(blim)+1, "%s", bdata(blim));
+            if (ret > 0)
+            {
+                eventHash[perfmon_numArchEvents].limit[ret] = '\0';
+            }
+            bdestroy(blim);
+            eventHash[perfmon_numArchEvents].optionMask = EVENT_OPTION_GENERIC_CONFIG_MASK|EVENT_OPTION_GENERIC_UMASK_MASK;
+            eventHash[perfmon_numArchEvents].numberOfOptions = 2;
+            eventHash[perfmon_numArchEvents].options[0].type = EVENT_OPTION_GENERIC_CONFIG;
+            eventHash[perfmon_numArchEvents].options[0].value = 0x0ULL;
+            eventHash[perfmon_numArchEvents].options[1].type = EVENT_OPTION_GENERIC_UMASK;
+            eventHash[perfmon_numArchEvents].options[1].value = 0x0ULL;
+            perfmon_numArchEvents++;
+        }
+    }
+
     return;
 }
 
@@ -1471,13 +1579,27 @@ perfmon_init_funcs(int* init_power, int* init_temp)
             break;
 
         case ZEN_FAMILY:
-            initThreadArch = perfmon_init_zen;
-            initialize_power = TRUE;
-            perfmon_startCountersThread = perfmon_startCountersThread_zen;
-            perfmon_stopCountersThread = perfmon_stopCountersThread_zen;
-            perfmon_readCountersThread = perfmon_readCountersThread_zen;
-            perfmon_setupCountersThread = perfmon_setupCounterThread_zen;
-            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_zen;
+            switch ( cpuid_info.model )
+            {
+                case ZEN_RYZEN:
+                    initThreadArch = perfmon_init_zen;
+                    initialize_power = TRUE;
+                    perfmon_startCountersThread = perfmon_startCountersThread_zen;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_zen;
+                    perfmon_readCountersThread = perfmon_readCountersThread_zen;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_zen;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_zen;
+                    break;
+                case ZEN2_RYZEN:
+                    initThreadArch = perfmon_init_zen2;
+                    initialize_power = TRUE;
+                    perfmon_startCountersThread = perfmon_startCountersThread_zen2;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_zen2;
+                    perfmon_readCountersThread = perfmon_readCountersThread_zen2;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_zen2;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_zen2;
+                    break;
+            }
             break;
 
         default:
@@ -1718,6 +1840,11 @@ perfmon_finalize(void)
 #ifndef LIKWID_USE_PERFEVENT
     HPMfinalize();
 #endif
+    if (eventHash)
+    {
+        free(eventHash[perfmon_numArchEvents-1].limit);
+        free(eventHash);
+    }
     perfmon_initialized = 0;
     groupSet = NULL;
     return;
@@ -1810,9 +1937,9 @@ perfmon_addEventSet(const char* eventCString)
 
     if (strchr(cstringcopy, ':') == NULL)
     {
-        err = read_group(config->groupPath, cpuid_info.short_name,
-                         cstringcopy,
-                         &groupSet->groups[groupSet->numberOfActiveGroups].group);
+        err = perfgroup_readGroup(config->groupPath, cpuid_info.short_name,
+                                  cstringcopy,
+                                  &groupSet->groups[groupSet->numberOfActiveGroups].group);
         if (err == -EACCES)
         {
             ERROR_PRINT(Access to performance group %s not allowed, cstringcopy);
@@ -1832,14 +1959,14 @@ perfmon_addEventSet(const char* eventCString)
     }
     else
     {
-        err = custom_group(cstringcopy, &groupSet->groups[groupSet->numberOfActiveGroups].group);
+        err = perfgroup_customGroup(cstringcopy, &groupSet->groups[groupSet->numberOfActiveGroups].group);
         if (err)
         {
             ERROR_PRINT(Cannot transform %s to performance group, cstringcopy);
             return err;
         }
     }
-    char * evstr = get_eventStr(&groupSet->groups[groupSet->numberOfActiveGroups].group);
+    char * evstr = perfgroup_getEventStr(&groupSet->groups[groupSet->numberOfActiveGroups].group);
     if (perf_pid != NULL)
     {
         char* tmp = realloc(evstr, strlen(evstr)+strlen(perf_pid)+1);
@@ -1924,14 +2051,12 @@ perfmon_addEventSet(const char* eventCString)
                 event->type = NOTYPE;
                 goto past_checks;
             }
-#ifndef LIKWID_USE_PERFEVENT
             if (!checkCounter(subtokens->entry[1], event->event.limit))
             {
                 fprintf(stderr, "WARN: Register %s not allowed for event %s (limit %s)\n", bdata(subtokens->entry[1]),bdata(subtokens->entry[0]),event->event.limit);
                 event->type = NOTYPE;
                 goto past_checks;
             }
-#endif
             if (parseOptions(subtokens, &event->event, event->index) < 0)
             {
                 event->type = NOTYPE;
@@ -2010,6 +2135,13 @@ perfmon_addEventSet(const char* eventCString)
         fprintf(stderr,"       Either the events or counters do not exist for the\n");
         fprintf(stderr,"       current architecture. If event options are set, they might\n");
         fprintf(stderr,"       be invalid.\n");
+        perfgroup_returnGroup(&groupSet->groups[groupSet->numberOfActiveGroups].group);
+        for(j = 0; j < eventSet->numberOfEvents; j++)
+        {
+            PerfmonEventSetEntry* event = &(eventSet->events[j]);
+            free(event->threadCounter);
+        }
+        free(eventSet->events);
         return -EINVAL;
     }
 }
@@ -2019,7 +2151,7 @@ perfmon_delEventSet(int groupID)
 {
     if (groupID >= groupSet->numberOfGroups || groupID < 0)
         return;
-    return_group(&groupSet->groups[groupID].group);
+    perfgroup_returnGroup(&groupSet->groups[groupID].group);
     return;
 }
 
@@ -2482,7 +2614,7 @@ perfmon_getMetric(int groupId, int metricId, int threadId)
 {
     int e = 0;
     double result = 0;
-    char split[2] = ":";
+    CounterList clist;
     if (unlikely(groupSet == NULL))
     {
         return NAN;
@@ -2508,36 +2640,17 @@ perfmon_getMetric(int groupId, int metricId, int threadId)
     {
         return NAN;
     }
-    bstring vars = bformat("");
-    bstring varlist = bformat("");
-    char* f = groupSet->groups[groupId].group.metricformulas[metricId];
+    timer_init();
+    init_clist(&clist);
     for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
     {
-        if (groupSet->groups[groupId].events[e].type != NOTYPE)
-        {
-            char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-            if (ctr)
-                calc_add_dbl_var(ctr, perfmon_getResult(groupId, e, threadId), vars, varlist);
-        }
-        else
-        {
-            char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-            if (ctr && strstr(f, ctr) != NULL)
-            {
-                calc_add_int_var(ctr, 0, vars, varlist);
-            }
-        }
-    }
-    if (groupSet->groups[groupId].group.lua_funcs)
-    {
-        if (calc_set_user_funcs(groupSet->groups[groupId].group.lua_funcs))
-        {
-            free(groupSet->groups[groupId].group.lua_funcs);
-            groupSet->groups[groupId].group.lua_funcs = NULL;
-        }
+        add_to_clist(&clist,groupSet->groups[groupId].group.counters[e],
+                     perfmon_getResult(groupId, e, threadId));
     }
-    calc_add_dbl_var("time", perfmon_getTimeOfGroup(groupId), vars, varlist);
-    calc_add_dbl_var("inverseClock", 1.0/timer_getCycleClock(), vars, varlist);
+    add_to_clist(&clist, "time", perfmon_getTimeOfGroup(groupId));
+    add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+    add_to_clist(&clist, "true", 1);
+    add_to_clist(&clist, "false", 0);
     int cpu = 0, sock_cpu = 0, err = 0;
     for (e=0; e<groupSet->numberOfThreads; e++)
     {
@@ -2556,46 +2669,34 @@ perfmon_getMetric(int groupId, int metricId, int threadId)
                 sock_cpu = groupSet->threads[e].thread_id;
             }
         }
-        calc_add_int_var("SOCKET_CPU", sock_cpu, vars, varlist);
         for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
         {
             if (perfmon_isUncoreCounter(groupSet->groups[groupId].group.counters[e]) &&
                 !perfmon_isUncoreCounter(groupSet->groups[groupId].group.metricformulas[metricId]))
             {
-                if (groupSet->groups[groupId].events[e].type != NOTYPE)
+                err = update_clist(&clist,groupSet->groups[groupId].group.counters[e], perfmon_getResult(groupId, e, sock_cpu));
+                if (err < 0)
                 {
-                    char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-                    double res = perfmon_getResult(groupId, e, sock_cpu);
-                    err = calc_add_dbl_var(ctr, res, vars, varlist);
-                    if (err < 0)
-                    {
-                        DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[groupId].group.counters[e], threadId);
-                    }
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[groupId].group.counters[e], threadId);
                 }
             }
         }
     }
-    else
-    {
-        calc_add_int_var("SOCKET_CPU", cpu, vars, varlist);
-    }
-
-    e = calc_metric(cpu, f, vars, varlist, &result);
-    bdestroy(vars);
-    bdestroy(varlist);
+    e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
     if (e < 0)
     {
-        result = NAN;
+        result = 0.0;
+        //ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
     }
+    destroy_clist(&clist);
     return result;
 }
-
 double
 perfmon_getLastMetric(int groupId, int metricId, int threadId)
 {
     int e = 0;
     double result = 0;
-    char split[2] = ":";
+    CounterList clist;
     if (unlikely(groupSet == NULL))
     {
         return NAN;
@@ -2621,36 +2722,17 @@ perfmon_getLastMetric(int groupId, int metricId, int threadId)
     {
         return NAN;
     }
-    bstring vars = bformat("");
-    bstring varlist = bformat("");
-    char* f = groupSet->groups[groupId].group.metricformulas[metricId];
+    timer_init();
+    init_clist(&clist);
     for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
     {
-        if (groupSet->groups[groupId].events[e].type != NOTYPE)
-        {
-            char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-            if (ctr)
-                calc_add_dbl_var(ctr, perfmon_getLastResult(groupId, e, threadId), vars, varlist);
-        }
-        else
-        {
-            char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-            if (ctr && strstr(f, ctr) != NULL)
-            {
-                calc_add_int_var(ctr, 0, vars, varlist);
-            }
-        }
-    }
-    if (groupSet->groups[groupId].group.lua_funcs)
-    {
-        if (calc_set_user_funcs(groupSet->groups[groupId].group.lua_funcs))
-        {
-            free(groupSet->groups[groupId].group.lua_funcs);
-            groupSet->groups[groupId].group.lua_funcs = NULL;
-        }
+        add_to_clist(&clist,groupSet->groups[groupId].group.counters[e],
+                     perfmon_getLastResult(groupId, e, threadId));
     }
-    calc_add_dbl_var("time", perfmon_getLastTimeOfGroup(groupId), vars, varlist);
-    calc_add_dbl_var("inverseClock", 1.0/timer_getCycleClock(), vars, varlist);
+    add_to_clist(&clist, "time", perfmon_getLastTimeOfGroup(groupId));
+    add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+    add_to_clist(&clist, "true", 1);
+    add_to_clist(&clist, "false", 0);
     int cpu = 0, sock_cpu = 0, err = 0;
     for (e=0; e<groupSet->numberOfThreads; e++)
     {
@@ -2659,7 +2741,6 @@ perfmon_getLastMetric(int groupId, int metricId, int threadId)
             cpu = groupSet->threads[e].processorId;
         }
     }
-    calc_add_int_var("CPU", cpu, vars, varlist);
     sock_cpu = socket_lock[affinity_thread2socket_lookup[cpu]];
     if (cpu != sock_cpu)
     {
@@ -2670,37 +2751,26 @@ perfmon_getLastMetric(int groupId, int metricId, int threadId)
                 sock_cpu = groupSet->threads[e].thread_id;
             }
         }
-        calc_add_int_var("SOCKET_CPU", sock_cpu, vars, varlist);
         for (e=0;e<groupSet->groups[groupId].numberOfEvents;e++)
         {
             if (perfmon_isUncoreCounter(groupSet->groups[groupId].group.counters[e]) &&
                 !perfmon_isUncoreCounter(groupSet->groups[groupId].group.metricformulas[metricId]))
             {
-                if (groupSet->groups[groupId].events[e].type != NOTYPE)
+                err = update_clist(&clist,groupSet->groups[groupId].group.counters[e], perfmon_getLastResult(groupId, e, sock_cpu));
+                if (err < 0)
                 {
-                    char *ctr = strtok(groupSet->groups[groupId].group.counters[e], split);
-                    double res = perfmon_getLastResult(groupId, e, sock_cpu);
-                    err = calc_add_dbl_var(ctr, res, vars, varlist);
-                    if (err < 0)
-                    {
-                        DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[groupId].group.counters[e], threadId);
-                    }
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[groupId].group.counters[e], threadId);
                 }
             }
         }
     }
-    else
-    {
-        calc_add_int_var("SOCKET_CPU", cpu, vars, varlist);
-    }
-
-    e = calc_metric(cpu, f, vars, varlist, &result);
-    bdestroy(vars);
-    bdestroy(varlist);
+    e = calc_metric(groupSet->groups[groupId].group.metricformulas[metricId], &clist, &result);
     if (e < 0)
     {
-        result = NAN;
+        result = 0.0;
+        //ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[groupId].group.metricformulas[metricId]);
     }
+    destroy_clist(&clist);
     return result;
 }
 
@@ -2715,11 +2785,18 @@ __perfmon_switchActiveGroupThread(int thread_id, int new_group)
         ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
         return -EINVAL;
     }
-
-    timer_stop(&groupSet->groups[groupSet->activeGroup].timer);
-    groupSet->groups[groupSet->activeGroup].rdtscTime =
-                timer_print(&groupSet->groups[groupSet->activeGroup].timer);
-    groupSet->groups[groupSet->activeGroup].runTime += groupSet->groups[groupSet->activeGroup].rdtscTime;
+    if (thread_id < 0 || thread_id >= groupSet->numberOfThreads)
+    {
+        return -EINVAL;
+    }
+    if (new_group < 0 || new_group >= groupSet->numberOfGroups)
+    {
+        return -EINVAL;
+    }
+    if (new_group == groupSet->activeGroup)
+    {
+        return 0;
+    }
     state = groupSet->groups[groupSet->activeGroup].state;
 
     if (state == STATE_START)
@@ -2734,6 +2811,7 @@ __perfmon_switchActiveGroupThread(int thread_id, int new_group)
             groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].init = FALSE;
         }
     }
+    // This updates groupSet->activeGroup to new_group
     ret = perfmon_setupCounters(new_group);
     if (ret != 0)
     {
@@ -3018,14 +3096,14 @@ perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos)
     int ret = 0;
     init_configuration();
     Configuration_t config = get_configuration();
-    ret = get_groups(config->groupPath, cpuid_info.short_name, groups, shortinfos, longinfos);
+    ret = perfgroup_getGroups(config->groupPath, cpuid_info.short_name, groups, shortinfos, longinfos);
     return ret;
 }
 
 void
 perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos)
 {
-    return_groups(nrgroups, groups, shortinfos, longinfos);
+    perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos); /* pure pass-through: all four arguments are handed on unchanged */
 }
 
 int
@@ -3279,9 +3357,7 @@ perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
 {
     int e = 0, err = 0;
     double result = 0.0;
-    char split[2] = ":";
-    bstring vars = bformat("");
-    bstring varlist = bformat("");
+    CounterList clist;
     if (perfmon_initialized != 1)
     {
         ERROR_PLAIN_PRINT(Perfmon module not properly initialized);
@@ -3303,38 +3379,25 @@ perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
     {
         return NAN;
     }
-
-    char* f = groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId];
+    timer_init();
+    init_clist(&clist);
     for (e=0;e<markerResults[region].eventCount;e++)
     {
-        if (groupSet->groups[markerResults[region].groupID].events[e].type != NOTYPE)
-        {
-            char* ctr = strtok(groupSet->groups[markerResults[region].groupID].group.counters[e], split);
-            if (ctr)
-            {
-                double res = perfmon_getResultOfRegionThread(region, e, threadId);
-                calc_add_dbl_var(ctr, res, vars, varlist);
-            }
-        }
-        else
-        {
-            char *ctr = strtok(groupSet->groups[markerResults[region].groupID].group.counters[e], split);
-            if (ctr && strstr(f, ctr) != NULL)
-            {
-                calc_add_int_var(ctr, 0, vars, varlist);
-            }
-        }
-    }
-    if (groupSet->groups[markerResults[region].groupID].group.lua_funcs)
-    {
-        if (calc_set_user_funcs(groupSet->groups[markerResults[region].groupID].group.lua_funcs))
+        err = add_to_clist(&clist,
+                     groupSet->groups[markerResults[region].groupID].group.counters[e],
+                     perfmon_getResultOfRegionThread(region, e, threadId));
+        if (err)
         {
-            free(groupSet->groups[markerResults[region].groupID].group.lua_funcs);
-            groupSet->groups[markerResults[region].groupID].group.lua_funcs = NULL;
+            printf("Cannot add counter %s to counter list for metric calculation\n",
+                    counter_map[groupSet->groups[markerResults[region].groupID].events[e].index].key);
+            destroy_clist(&clist);
+            return 0;
         }
     }
-    calc_add_dbl_var("time", perfmon_getTimeOfRegion(region, threadId), vars, varlist);
-    calc_add_dbl_var("inverseClock", 1.0/timer_getCycleClock(), vars, varlist);
+    add_to_clist(&clist, "time", perfmon_getTimeOfRegion(region, threadId));
+    add_to_clist(&clist, "inverseClock", 1.0/timer_getCycleClock());
+    add_to_clist(&clist, "true", 1);
+    add_to_clist(&clist, "false", 0);
     int cpu = 0, sock_cpu = 0;
     for (e=0; e<groupSet->numberOfThreads; e++)
     {
@@ -3343,7 +3406,6 @@ perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
             cpu = groupSet->threads[e].processorId;
         }
     }
-
     sock_cpu = socket_lock[affinity_thread2socket_lookup[cpu]];
     if (cpu != sock_cpu)
     {
@@ -3354,38 +3416,25 @@ perfmon_getMetricOfRegionThread(int region, int metricId, int threadId)
                 sock_cpu = groupSet->threads[e].thread_id;
             }
         }
-        calc_add_int_var("SOCKET_CPU", sock_cpu, vars, varlist);
         for (e=0;e<markerResults[region].eventCount;e++)
         {
             if (perfmon_isUncoreCounter(groupSet->groups[markerResults[region].groupID].group.counters[e]) &&
                 !perfmon_isUncoreCounter(groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId]))
             {
-                if (groupSet->groups[markerResults[region].groupID].events[e].type != NOTYPE)
+                err = update_clist(&clist,groupSet->groups[markerResults[region].groupID].group.counters[e], perfmon_getResultOfRegionThread(region, e, sock_cpu));
+                if (err < 0)
                 {
-                    char *ctr = strtok(groupSet->groups[markerResults[region].groupID].group.counters[e], split);
-                    double res = perfmon_getResultOfRegionThread(region, e, sock_cpu);
-                    err = calc_add_dbl_var(ctr, res, vars, varlist);
-                    if (err < 0)
-                    {
-                        DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[markerResults[region].groupID].group.counters[e], threadId);
-                    }
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot add socket result of counter %s for thread %d, groupSet->groups[markerResults[region].groupID].group.counters[e], threadId);
                 }
             }
         }
     }
-    else
-    {
-        calc_add_int_var("SOCKET_CPU", cpu, vars, varlist);
-    }
-
-    err = calc_metric(cpu, f, vars, varlist, &result);
-    bdestroy(vars);
-    bdestroy(varlist);
+    err = calc_metric(groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId], &clist, &result);
     if (err < 0)
     {
-        ERROR_PRINT(Cannot calculate formula %s, f);
-        return NAN;
+        ERROR_PRINT(Cannot calculate formula %s, groupSet->groups[markerResults[region].groupID].group.metricformulas[metricId]);
     }
+    destroy_clist(&clist);
     return result;
 }
 
@@ -3553,4 +3602,3 @@ perfmon_destroyMarkerResults()
         free(markerResults);
     }
 }
-
diff --git a/src/power.c b/src/power.c
index 1ad601fdd..c94815c9d 100644
--- a/src/power.c
+++ b/src/power.c
@@ -58,13 +58,22 @@ power_init(int cpuId)
     int err;
     uint32_t unit_reg = MSR_RAPL_POWER_UNIT;
     int numDomains = NUM_POWER_DOMAINS;
+    Configuration_t config;
 
     /* determine Turbo Mode features */
     double busSpeed;
-    if (power_initialized || !lock_check())
+    if (power_initialized)
     {
         return 0;
     }
+    if (!lock_check())
+    {
+        ERROR_PLAIN_PRINT(Access to performance monitoring registers locked);
+        return -ENOLCK;
+    }
+    init_configuration();
+    config = get_configuration();
+
 
     power_info.baseFrequency = 0;
     power_info.minFrequency = 0;
@@ -76,6 +85,11 @@ power_init(int cpuId)
     power_info.uncoreMinFreq = 0;
     power_info.uncoreMaxFreq = 0;
     power_info.perfBias = 0;
+    if (config->daemonMode == ACCESSMODE_PERF)
+    {
+        ERROR_PRINT(RAPL in access mode 'perf_event' only available with perfmon);
+        return 0;
+    }
 
     switch (cpuid_info.family)
     {
@@ -128,7 +142,8 @@ power_init(int cpuId)
             }
             break;
         case ZEN_FAMILY:
-            if (cpuid_info.model == ZEN_RYZEN)
+            if (cpuid_info.model == ZEN_RYZEN ||
+                cpuid_info.model == ZEN2_RYZEN)
             {
                 cpuid_info.turbo = 0;
                 power_info.hasRAPL = 1;
@@ -650,4 +665,3 @@ get_powerInfo(void)
 {
     return &power_info;
 }
-
diff --git a/src/timer.c b/src/timer.c
index 1f008e034..410954618 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -139,6 +139,23 @@ fRDTSCP(TscCounter* cpu_c)
 #endif
 #endif
 
+#if defined(_ARCH_PPC)
+/* Store the PowerPC time base in cpu_c->int64: upper half via mftbu, lower
+ * half via mftb, re-read until both upper samples agree (no torn value). */
+static void
+TIMER(TscCounter* cpu_c)
+{
+    uint32_t tbl, tbu0, tbu1;
+
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
+    (cpu_c)->int64 = (((uint64_t)tbu0) << 32) | tbl;
+}
+#endif
+
 static int os_timer(TscCounter* time)
 {
     int ret;
@@ -426,6 +443,10 @@ timer_init( void )
 #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A)
         TSTART = os_timer_start;
         TSTOP = os_timer_stop;
+#endif
+#ifdef _ARCH_PPC
+        TSTART = TIMER;
+        TSTOP = TIMER;
 #endif
     }
     if (cpuClock == 0ULL)
diff --git a/src/topology.c b/src/topology.c
index 86a5bcc85..0a6fe5fe8 100644
--- a/src/topology.c
+++ b/src/topology.c
@@ -107,15 +107,20 @@ static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
 static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
 static char* amd_k8_str = "AMD K8 architecture";
 static char* amd_zen_str = "AMD K17 (Zen) architecture";
+static char* amd_zen2_str = "AMD K17 (Zen2) architecture";
 static char* armv7l_str = "ARM 7l architecture";
 static char* armv8_str = "ARM 8 architecture";
 static char* cavium_thunderx2t99_str = "Cavium Thunder X2 (ARMv8)";
 static char* cavium_thunderx_str = "Cavium Thunder X (ARMv8)";
 static char* arm_cortex_a57 = "ARM Cortex A57 (ARMv8)";
 static char* arm_cortex_a53 = "ARM Cortex A53 (ARMv8)";
+static char* power7_str = "POWER7 architecture";
+static char* power8_str = "POWER8 architecture";
+static char* power9_str = "POWER9 architecture";
 
 static char* unknown_intel_str = "Unknown Intel Processor";
 static char* unknown_amd_str = "Unknown AMD Processor";
+static char* unknown_power_str = "Unknown POWER Processor";
 
 static char* short_core2 = "core2";
 static char* short_atom = "atom";
@@ -151,12 +156,17 @@ static char* short_k10 = "k10";
 static char* short_k15 = "interlagos";
 static char* short_k16 = "kabini";
 static char* short_zen = "zen";
+static char* short_zen2 = "zen2";
 
 static char* short_arm7 = "arm7";
 static char* short_arm8 = "arm8";
 static char* short_arm8_cav_tx2 = "arm8_tx2";
 static char* short_arm8_cav_tx = "arm8_tx";
 
+static char* short_power7 = "power7";
+static char* short_power8 = "power8";
+static char* short_power9 = "power9";
+
 static char* short_unknown = "unknown";
 
 /* #####  EXPORTED VARIABLES  ########################################## */
@@ -901,9 +911,41 @@ topology_setName(void)
             cpuid_info.short_name = short_k16;
             break;
 
+        case PPC_FAMILY:
+            switch (cpuid_info.model)
+            {
+                case POWER7:
+                    cpuid_info.name = power7_str;
+                    cpuid_info.short_name = short_power7;
+                    break;
+                case POWER8:
+                    cpuid_info.name = power8_str;
+                    cpuid_info.short_name = short_power8;
+                    break;
+                case POWER9:
+                    cpuid_info.name = power9_str;
+                    cpuid_info.short_name = short_power9;
+                    break;
+                default:
+                    cpuid_info.name = unknown_power_str;
+                    cpuid_info.short_name = short_unknown;
+                    break;
+            }
+            break;
+
         case ZEN_FAMILY:
-            cpuid_info.name = amd_zen_str;
-            cpuid_info.short_name = short_zen;
+            switch (cpuid_info.model)
+            {
+                case ZEN2_RYZEN:
+                    cpuid_info.name = amd_zen2_str;
+                    cpuid_info.short_name = short_zen2;
+                    break;
+                case ZEN_RYZEN:
+                default: /* any other Zen-family model keeps the generic Zen naming */
+                    cpuid_info.name = amd_zen_str;
+                    cpuid_info.short_name = short_zen;
+                    break;
+            }
             break;
 
         case ARMV7_FAMILY:
@@ -1069,7 +1111,7 @@ topology_init(void)
         sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
         if (cpu_count(&cpuSet) < sysconf(_SC_NPROCESSORS_CONF))
         {
-#if !defined(__ARM_ARCH_7A__)
+#if !defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_8A)
             cpuid_topology.activeHWThreads =
                 ((cpu_count(&cpuSet) < sysconf(_SC_NPROCESSORS_CONF)) ?
                 cpu_count(&cpuSet) :
@@ -1283,7 +1325,15 @@ print_supportedCPUs (void)
     printf("\t%s\n",interlagos_str);
     printf("\t%s\n",kabini_str);
     printf("\t%s\n",amd_zen_str);
+    printf("\t%s\n",amd_zen2_str);
+    printf("\n");
+    printf("Supported ARMv8 processors:\n");
+    printf("\t%s\n",arm_cortex_a53);
+    printf("\t%s\n",arm_cortex_a57);
+    printf("\t%s\n",cavium_thunderx_str);
+    printf("\t%s\n",cavium_thunderx2t99_str);
     printf("\n");
+
 }
 
 CpuTopology_t
@@ -1303,4 +1353,3 @@ get_numaTopology(void)
 {
     return &numa_info;
 }
-
diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c
index 6911079fe..886e2a727 100644
--- a/src/topology_hwloc.c
+++ b/src/topology_hwloc.c
@@ -51,7 +51,7 @@ hwloc_topology_t hwloc_topology = NULL;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 #if defined(__ARM_ARCH_8A) || defined(__ARM_ARCH_7A__)
-int parse_cpuinfo(uint32_t* family, uint32_t* variant, uint32_t *stepping, uint32_t *part, uint32_t *vendor)
+int parse_cpuinfo(uint32_t* count, uint32_t* family, uint32_t* variant, uint32_t *stepping, uint32_t *part, uint32_t *vendor)
 {
     int i = 0;
     FILE *fp = NULL;
@@ -60,23 +60,29 @@ int parse_cpuinfo(uint32_t* family, uint32_t* variant, uint32_t *stepping, uint3
     uint32_t s = 0;
     uint32_t p = 0;
     uint32_t vend = 0;
+    uint32_t c = 0;
     int (*ownatoi)(const char*);
     ownatoi = &atoi;
 
     if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
     {
-        const_bstring familyString = bformat("CPU architecture:");
-        const_bstring variantString = bformat("CPU variant\t:");
-        const_bstring steppingString = bformat("CPU revision\t:");
-        const_bstring partString = bformat("CPU part\t:");
-        const_bstring vendString = bformat("CPU implementer\t:");
+        const_bstring familyString = bformat("CPU architecture");
+        const_bstring variantString = bformat("CPU variant");
+        const_bstring steppingString = bformat("CPU revision");
+        const_bstring partString = bformat("CPU part");
+        const_bstring vendString = bformat("CPU implementer");
+        const_bstring procString = bformat("processor");
         bstring src = bread ((bNread) fread, fp);
         struct bstrList* tokens = bsplit(src,(char) '\n');
         bdestroy(src);
         fclose(fp);
         for (i=0;i<tokens->qty;i++)
         {
-            if ((f == 0) && (binstr(tokens->entry[i],0,familyString) != BSTR_ERR))
+            if (binstr(tokens->entry[i],0,procString) != BSTR_ERR)
+            {
+                c++; /* every "processor" line counts, independent of f */
+            }
+            else if ((f == 0) && (binstr(tokens->entry[i],0,familyString) != BSTR_ERR))
             {
                 struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
                 bltrimws(subtokens->entry[1]);
@@ -126,6 +132,7 @@ int parse_cpuinfo(uint32_t* family, uint32_t* variant, uint32_t *stepping, uint3
     *stepping = s;
     *part = p;
     *vendor = vend;
+    *count = c;
     return 0;
 }
 
@@ -134,7 +141,9 @@ int parse_cpuname(char *name)
     FILE *fp = NULL;
     if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
     {
-        const_bstring nameString = bformat("Hardware\t:");
+        int found = 0;
+        const_bstring nameString = bformat("Hardware");
+        const_bstring nameString2 = bformat("model name");
         bstring src = bread ((bNread) fread, fp);
         struct bstrList* tokens = bsplit(src,(char) '\n');
         bdestroy(src);
@@ -147,6 +156,22 @@ int parse_cpuname(char *name)
                 bltrimws(subtokens->entry[1]);
                 strncpy(name, bdata(subtokens->entry[1]), MAX_MODEL_STRING_LENGTH-1);
                 bstrListDestroy(subtokens);
+                found = 1;
+                break;
+            }
+        }
+        if (!found)
+        {
+            for (int i = 0; i < tokens->qty; i++)
+            {
+                if ((binstr(tokens->entry[i],0,nameString2) != BSTR_ERR))
+                {
+                    struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
+                    bltrimws(subtokens->entry[1]);
+                    strncpy(name, bdata(subtokens->entry[1]), MAX_MODEL_STRING_LENGTH-1);
+                    bstrListDestroy(subtokens);
+                    break;
+                }
             }
         }
         bstrListDestroy(tokens);
@@ -219,7 +244,11 @@ hwloc_init_cpuInfo(cpu_set_t cpuSet)
     if (!hwloc_topology)
     {
         likwid_hwloc_topology_init(&hwloc_topology);
+#if HWLOC_API_VERSION >= 0x00020000 /* hwloc 2.0.x reports exactly 0x00020000 */
+        likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM );
+#else
         likwid_hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM|HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+#endif
         likwid_hwloc_topology_load(hwloc_topology);
     }
     obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_SOCKET, 0);
@@ -247,6 +276,7 @@ hwloc_init_cpuInfo(cpu_set_t cpuSet)
         cpuid_info.isIntel = strcmp(info, "GenuineIntel") == 0;
     if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUStepping")))
         cpuid_info.stepping = atoi(info);
+    snprintf(cpuid_info.architecture, 19, "x86_64");
 #endif
 #ifdef __ARM_ARCH_7A__
     if ((info = hwloc_obj_get_info_by_name(obj, "CPUArchitecture")))
@@ -256,19 +286,59 @@ hwloc_init_cpuInfo(cpu_set_t cpuSet)
     if (cpuid_info.family == 0 || cpuid_info.model == 0)
     {
         uint32_t part = 0;
-        parse_cpuinfo(&cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor);
+        uint32_t count = 0;
+        parse_cpuinfo(&count, &cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor);
         parse_cpuname(cpuid_info.osname);
     }
+    snprintf(cpuid_info.architecture, 19, "armv7");
 #endif
 #ifdef __ARM_ARCH_8A
     uint32_t part = 0;
-    parse_cpuinfo(&cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor);
+    uint32_t count = 0;
+    parse_cpuinfo(&count, &cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor);
     parse_cpuname(cpuid_info.osname);
+    snprintf(cpuid_info.architecture, 19, "armv8");
 #endif
+
+#ifndef _ARCH_PPC
     if ((info = hwloc_obj_get_info_by_name(obj, "CPUModel")))
         strcpy(cpuid_info.osname, info);
+#else
+    if ((info = likwid_hwloc_obj_get_info_by_name(obj, "CPUModel")))
+    {
+        if (strstr(info, "POWER7") != NULL)
+        {
+            cpuid_info.model = POWER7;
+            cpuid_info.family = PPC_FAMILY;
+            cpuid_info.isIntel = 0;
+            strcpy(cpuid_info.osname, info);
+            cpuid_info.stepping = 0;
+        }
+        if (strstr(info, "POWER8") != NULL)
+        {
+            cpuid_info.model = POWER8;
+            cpuid_info.family = PPC_FAMILY;
+            cpuid_info.isIntel = 0;
+            strcpy(cpuid_info.osname, info);
+            cpuid_info.stepping = 0;
+        }
+        if (strstr(info, "POWER9") != NULL)
+        {
+            cpuid_info.model = POWER9;
+            cpuid_info.family = PPC_FAMILY;
+            cpuid_info.isIntel = 0;
+            strcpy(cpuid_info.osname, info);
+            cpuid_info.stepping = 0;
+        }
+    }
+#endif
+
 
     cpuid_topology.numHWThreads = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A)
+    if (count > cpuid_topology.numHWThreads)
+        cpuid_topology.numHWThreads = count;
+#endif
     if (cpuid_topology.activeHWThreads > cpuid_topology.numHWThreads)
         cpuid_topology.numHWThreads = cpuid_topology.activeHWThreads;
     DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC CpuInfo Family %d Model %d Stepping %d Vendor 0x%X Part 0x%X isIntel %d numHWThreads %d activeHWThreads %d,
@@ -297,14 +367,22 @@ hwloc_init_nodeTopology(cpu_set_t cpuSet)
     int nr_sockets = 1;
     int id = 0;
     int consecutive_cores = -1;
+    int from_file = (getenv("HWLOC_FSROOT") != NULL);
     hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
-    for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+    if (!from_file)
     {
-        if (CPU_ISSET(i, &cpuSet))
+        for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
         {
-            poolsize = i+1;
+            if (CPU_ISSET(i, &cpuSet))
+            {
+                poolsize = i+1;
+            }
         }
     }
+    else
+    {
+        poolsize = cpuid_topology.numHWThreads;
+    }
     hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
     for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
     {
@@ -346,7 +424,13 @@ hwloc_init_nodeTopology(cpu_set_t cpuSet)
         }
         id = obj->os_index;
         if (CPU_ISSET(id, &cpuSet))
+        {
+            hwThreadPool[id].inCpuSet = 1;
+        }
+        else if (from_file)
+        {
             hwThreadPool[id].inCpuSet = 1;
+        }
         hwThreadPool[id].apicId = obj->os_index;
         hwThreadPool[id].threadId = obj->sibling_rank;
         if (maxNumLogicalProcsPerCore > 1)
@@ -469,6 +553,7 @@ hwloc_init_nodeTopology(cpu_set_t cpuSet)
 
 void hwloc_split_llc_check(CacheLevel* llc_cache)
 {
+    int i = 0;
     hwloc_obj_t obj = NULL;
     int num_sockets = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_SOCKET);
     int num_nodes = likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_NODE);
@@ -478,8 +563,24 @@ void hwloc_split_llc_check(CacheLevel* llc_cache)
     }
     obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_SOCKET, 0);
     int num_threads_per_socket = likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+    if (num_threads_per_socket == 0)
+    {
+        for (i = 0; i < likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU); i++)
+        {
+            if (hwloc_bitmap_isset(obj->cpuset, i))
+                num_threads_per_socket++;
+        }
+    }
     obj = likwid_hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_NODE, 0);
     int num_threads_per_node = likwid_hwloc_record_objs_of_type_below_obj(hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+    if (num_threads_per_node == 0)
+    {
+        for (i = 0; i < likwid_hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU); i++)
+        {
+            if (hwloc_bitmap_isset(obj->cpuset, i))
+                num_threads_per_node++;
+        }
+    }
     if (num_threads_per_node < num_threads_per_socket)
     {
         llc_cache->threads = num_threads_per_node;
@@ -505,17 +606,42 @@ hwloc_init_cacheTopology(void)
     depth = likwid_hwloc_topology_get_depth(hwloc_topology);
     for (d = 0; d < depth; d++)
     {
+#if HWLOC_API_VERSION >= 0x00020000 /* hwloc 2.0.x reports exactly 0x00020000 */
+        hwloc_obj_type_t depth_type = likwid_hwloc_get_depth_type(hwloc_topology, d);
+        if (depth_type == HWLOC_OBJ_L1CACHE ||
+            depth_type == HWLOC_OBJ_L2CACHE ||
+            depth_type == HWLOC_OBJ_L3CACHE ||
+            depth_type == HWLOC_OBJ_L4CACHE ||
+            depth_type == HWLOC_OBJ_L5CACHE)
+            maxNumLevels++;
+#else
         if (likwid_hwloc_get_depth_type(hwloc_topology, d) == HWLOC_OBJ_CACHE)
             maxNumLevels++;
+#endif
     }
     cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+    if (!cachePool)
+    {
+        cpuid_topology.numCacheLevels = 0;
+        cpuid_topology.cacheLevels = NULL;
+        return;
+    }
     /* Start at the bottom of the tree to get all cache levels in order */
     depth = likwid_hwloc_topology_get_depth(hwloc_topology);
     id = 0;
     for(d=depth-1;d >= 0; d--)
     {
         /* We only need caches, so skip other levels */
-        if (likwid_hwloc_get_depth_type(hwloc_topology, d) != HWLOC_OBJ_CACHE)
+#if HWLOC_API_VERSION >= 0x00020000 /* hwloc 2.0.x reports exactly 0x00020000 */
+        hwloc_obj_type_t depth_type = likwid_hwloc_get_depth_type(hwloc_topology, d);
+        if (depth_type != HWLOC_OBJ_L1CACHE &&
+            depth_type != HWLOC_OBJ_L2CACHE &&
+            depth_type != HWLOC_OBJ_L3CACHE &&
+            depth_type != HWLOC_OBJ_L4CACHE &&
+            depth_type != HWLOC_OBJ_L5CACHE)
+#else /* must mirror the == HWLOC_OBJ_CACHE test that sized cachePool */
+        if (likwid_hwloc_get_depth_type(hwloc_topology, d) != HWLOC_OBJ_CACHE)
+#endif
         {
             continue;
         }
@@ -551,6 +677,15 @@ hwloc_init_cacheTopology(void)
         cachePool[id].lineSize = obj->attr->cache.linesize;
         cachePool[id].size = obj->attr->cache.size;
         cachePool[id].sets = 0;
+#ifdef _ARCH_PPC
+        if ((cpuid_info.family == PPC_FAMILY) && ((cpuid_info.model == POWER8) || (cpuid_info.model == POWER9)))
+        {
+            if (cachePool[id].lineSize == 0)
+                cachePool[id].lineSize = 128;
+            if (cachePool[id].associativity == 0)
+                cachePool[id].associativity = 8;
+        }
+#endif
         if ((cachePool[id].associativity * cachePool[id].lineSize) != 0)
         {
             cachePool[id].sets = cachePool[id].size /
@@ -591,9 +726,10 @@ hwloc_init_cacheTopology(void)
 #if defined(_ARCH_PPC) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A)
         cachePool[id].inclusive = 0;
 #endif
-        DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC Cache Pool ID %d Level %d Size %d,
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC Cache Pool ID %d Level %d Size %d Threads %d,
                                       id, cachePool[id].level,
-                                      cachePool[id].size);
+                                      cachePool[id].size,
+                                      cachePool[id].threads);
         id++;
     }
 
diff --git a/src/topology_proc.c b/src/topology_proc.c
index bd3b2e118..e0e756437 100644
--- a/src/topology_proc.c
+++ b/src/topology_proc.c
@@ -41,6 +41,9 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <bstrlib.h>
+#include <bstrlib_helper.h>
+
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
 static int
@@ -68,7 +71,7 @@ get_cpu_perf_data(void)
 
         eax = 0x06;
         CPUID(eax, ebx, ecx, edx);
-        if (eax & (1<<1))
+        if (eax & (1ULL<<1))
         {
             cpuid_info.turbo = 1;
         }
@@ -162,7 +165,7 @@ static int readCacheInclusiveAMD(int level)
     eax = 0x8000001D;
     ecx = level;
     CPUID(eax, ebx, ecx, edx);
-    return (edx & (0x1<<1));
+    return (edx & (0x1ULL<<1));
 }
 #else
 static int readCacheInclusiveIntel(int level)
@@ -189,20 +192,31 @@ proc_init_cpuInfo(cpu_set_t cpuSet)
     ownatoi = &atoi;
     ownstrcpy = &strcpy;
 
-    const_bstring countString = bformat("processor\t:");
+#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64)
     const_bstring modelString = bformat("model\t\t:");
-    const_bstring familyString = bformat("cpu family\t:");
     const_bstring steppingString = bformat("stepping\t:");
+    const_bstring nameString = bformat("model name\t:");
+#endif
+#ifdef _ARCH_PPC
+    const_bstring modelString = bformat("cpu\t\t:");
+    const_bstring steppingString = bformat("revision\t:");
+    const_bstring nameString = bformat("machine\t\t:");
+#endif
+    const_bstring familyString = bformat("cpu family\t:");
+    const_bstring countString = bformat("processor\t:");
     const_bstring vendorString = bformat("vendor_id\t:");
     const_bstring vendorIntelString = bformat("GenuineIntel");
-    const_bstring nameString = bformat("model name\t:");
 
     cpuid_info.isIntel = 0;
     cpuid_info.model = 0;
     cpuid_info.family = 0;
+#ifdef _ARCH_PPC
+    cpuid_info.family = PPC_FAMILY;
+#endif
     cpuid_info.stepping = 0;
     cpuid_topology.numHWThreads = 0;
     cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    cpuid_info.osname[0] = '\0';
 
     if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
     {
@@ -212,16 +226,35 @@ proc_init_cpuInfo(cpu_set_t cpuSet)
         fclose(fp);
         for (i=0;i<tokens->qty;i++)
         {
+            /* Classify this entry by the key string it contains. */
             if (binstr(tokens->entry[i],0,countString) != BSTR_ERR)
             {
                 HWthreads++;
             }
             else if ((cpuid_info.model == 0) && (binstr(tokens->entry[i],0,modelString) != BSTR_ERR))
             {
+#ifndef _ARCH_PPC
                 struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
                 bltrimws(subtokens->entry[1]);
                 cpuid_info.model = ownatoi(bdata(subtokens->entry[1]));
                 bstrListDestroy(subtokens);
+#else
+		const_bstring power7str = bformat("POWER7");
+		const_bstring power8str = bformat("POWER8");
+		const_bstring power9str = bformat("POWER9");
+		if (binstr(tokens->entry[i],0, power7str) != BSTR_ERR)
+		{
+			cpuid_info.model = POWER7;
+		}
+		else if (binstr(tokens->entry[i],0, power8str) != BSTR_ERR)
+                {
+                        cpuid_info.model = POWER8;
+                }
+		else if (binstr(tokens->entry[i],0, power9str) != BSTR_ERR)
+                {
+                        cpuid_info.model = POWER9;
+                }
+#endif
             }
             else if ((cpuid_info.family == 0) && (binstr(tokens->entry[i],0,familyString) != BSTR_ERR))
             {
@@ -241,6 +274,7 @@ proc_init_cpuInfo(cpu_set_t cpuSet)
             {
                 struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':');
                 bltrimws(subtokens->entry[1]);
+
                 ownstrcpy(cpuid_info.osname, bdata(subtokens->entry[1]));
                 bstrListDestroy(subtokens);
             }
@@ -257,6 +291,15 @@ proc_init_cpuInfo(cpu_set_t cpuSet)
         }
         bstrListDestroy(tokens);
         cpuid_topology.numHWThreads = HWthreads;
+#ifdef __x86_64
+        snprintf(cpuid_info.architecture, 19, "x86_64");
+#endif
+#ifdef __ARM_ARCH_7A__
+        snprintf(cpuid_info.architecture, 19, "armv7");
+#endif
+#ifdef __ARM_ARCH_8A
+        snprintf(cpuid_info.architecture, 19, "armv8");
+#endif
         DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d,
                             cpuid_info.family,
                             cpuid_info.model,
@@ -267,217 +310,249 @@ proc_init_cpuInfo(cpu_set_t cpuSet)
     return;
 }
 
+
 void
 proc_init_cpuFeatures(void)
 {
-    int ret;
+    int ret = 0;
     FILE* file;
     char buf[1024];
     char ident[30];
     char delimiter[] = " ";
     char* cptr;
+#ifdef _ARCH_PPC
+    return;
+#endif
+    const_bstring flagString = bformat("flags");
+    const_bstring featString = bformat("Features");
+    bstring flagline = bfromcstr("");
 
-    if ( (file = fopen( "/proc/cpuinfo", "r")) == NULL )
+    bstring cpuinfo = read_file("/proc/cpuinfo");
+    struct bstrList* cpulines = bsplit(cpuinfo, '\n');
+    bdestroy(cpuinfo);
+    for (int i = 0; i < cpulines->qty; i++)
     {
-        fprintf(stderr, "Cannot open /proc/cpuinfo\n");
-        return;
-    }
-    ret = 0;
-    while( fgets(buf, sizeof(buf)-1, file) )
-    {
-        ret = sscanf(buf, "%s\t:", &(ident[0]));
 #if defined(__x86_64__) || defined(__i386__)
-        if (ret != 1 || strcmp(ident,"flags") != 0 || strcmp(ident, "Features") != 0)
+        if (bstrncmp(cpulines->entry[i], flagString, 5) == BSTR_OK ||
+            bstrncmp(cpulines->entry[i], featString, 8) == BSTR_OK)
 #endif
 #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A)
-        if (ret != 1 || strcmp(ident, "Features") != 0)
+        if (bstrncmp(cpulines->entry[i], featString, 8) == BSTR_OK)
+#endif
+#ifdef _ARCH_PPC
+	if (ret != 1)
 #endif
         {
-            continue;
-        }
-        else
-        {
-            ret = 1;
+            bdestroy(flagline);
+            flagline = bstrcpy(cpulines->entry[i]);
             break;
         }
     }
-    fclose(file);
-    if (ret == 0)
-    {
-        return;
-    }
+    bstrListDestroy(cpulines);
+
+    struct bstrList* flaglist = bsplit(flagline, ' ');
+    bstring bfeatures = bfromcstr("");
 
     cpuid_info.featureFlags = 0;
-    cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
-    cpuid_info.features[0] = '\0';
-    buf[strcspn(buf, "\n")] = '\0';
-    cptr = strtok(&(buf[6]),delimiter);
 
-    while (cptr != NULL)
+    for (int i = 1; i < flaglist->qty; i++)
     {
-        if (strcmp(cptr,"ssse3") == 0)
+        if (bisstemeqblk(flaglist->entry[i], "sse4_1", 6) == 1)
+        {
+            setBit(cpuid_info.featureFlags, SSE41);
+            bcatcstr(bfeatures, "SSE4.1 ");
+        }
+        else if (bisstemeqblk(flaglist->entry[i], "sse4_2", 6) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SSSE3);
-            strcat(cpuid_info.features, "SSSE3 ");
+            setBit(cpuid_info.featureFlags, SSE42);
+            bcatcstr(bfeatures, "SSE4.2 ");
         }
-        else if (strcmp(cptr,"sse3") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "sse4a", 5) == 1) /* "sse4a" has 5 chars; length 6 could never match */
         {
-            cpuid_info.featureFlags |= (1<<SSE3);
-            strcat(cpuid_info.features, "SSE3 ");
+            setBit(cpuid_info.featureFlags, SSE4A);
+            bcatcstr(bfeatures, "SSE4a ");
         }
-        else if (strcmp(cptr,"monitor") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "ssse3", 5) == 1)
         {
-            cpuid_info.featureFlags |= (1<<MONITOR);
-            strcat(cpuid_info.features, "MONITOR ");
+            setBit(cpuid_info.featureFlags, SSSE3);
+            bcatcstr(bfeatures, "SSSE3 "); /* label must name the SSSE3 feature in full */
         }
-        else if (strcmp(cptr,"mmx") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "sse3", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<MMX);
-            strcat(cpuid_info.features, "MMX ");
+            setBit(cpuid_info.featureFlags, SSE3);
+            bcatcstr(bfeatures, "SSE3 ");
         }
-        else if (strcmp(cptr,"sse") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "sse2", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SSE);
-            strcat(cpuid_info.features, "SSE ");
+            setBit(cpuid_info.featureFlags, SSE2);
+            bcatcstr(bfeatures, "SSE2 ");
         }
-        else if (strcmp(cptr,"sse2") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "monitor", 7) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SSE2);
-            strcat(cpuid_info.features, "SSE2 ");
+            setBit(cpuid_info.featureFlags, MONITOR);
+            bcatcstr(bfeatures, "MONITOR ");
         }
-        else if (strcmp(cptr,"acpi") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "mmx", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<ACPI);
-            strcat(cpuid_info.features, "ACPI ");
+            setBit(cpuid_info.featureFlags, MMX);
+            bcatcstr(bfeatures, "MMX ");
         }
-        else if (strcmp(cptr,"rdtscp") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "sse", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<RDTSCP);
-            strcat(cpuid_info.features, "RDTSCP ");
+            setBit(cpuid_info.featureFlags, SSE);
+            bcatcstr(bfeatures, "SSE ");
         }
-        else if (strcmp(cptr,"vmx") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "acpi", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<VMX);
-            strcat(cpuid_info.features, "VMX ");
+            setBit(cpuid_info.featureFlags, ACPI);
+            bcatcstr(bfeatures, "ACPI ");
         }
-        else if (strcmp(cptr,"est") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "rdtscp", 6) == 1)
         {
-            cpuid_info.featureFlags |= (1<<EIST);
-            strcat(cpuid_info.features, "EIST ");
+            setBit(cpuid_info.featureFlags, RDTSCP);
+            bcatcstr(bfeatures, "RDTSCP ");
         }
-        else if (strcmp(cptr,"tm") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "vmx", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<TM);
-            strcat(cpuid_info.features, "TM ");
+            setBit(cpuid_info.featureFlags, VMX);
+            bcatcstr(bfeatures, "VMX ");
         }
-        else if (strcmp(cptr,"tm2") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "est", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<TM2);
-            strcat(cpuid_info.features, "TM2 ");
+            setBit(cpuid_info.featureFlags, EIST);
+            bcatcstr(bfeatures, "EIST ");
         }
-        else if (strcmp(cptr,"aes") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "tm2", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<AES);
-            strcat(cpuid_info.features, "AES ");
+            setBit(cpuid_info.featureFlags, TM2);
+            bcatcstr(bfeatures, "TM2 ");
         }
-        else if (strcmp(cptr,"rdrand") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "tm", 2) == 1)
         {
-            cpuid_info.featureFlags |= (1<<RDRAND);
-            strcat(cpuid_info.features, "RDRAND ");
+            setBit(cpuid_info.featureFlags, TM);
+            bcatcstr(bfeatures, "TM ");
         }
-        else if (strcmp(cptr,"sse4_1") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "aes", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SSE41);
-            strcat(cpuid_info.features, "SSE4.1 ");
+            setBit(cpuid_info.featureFlags, AES);
+            bcatcstr(bfeatures, "AES ");
         }
-        else if (strcmp(cptr,"sse4_2") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "rdrand", 6) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SSE42);
-            strcat(cpuid_info.features, "SSE4.2 ");
+            setBit(cpuid_info.featureFlags, RDRAND);
+            bcatcstr(bfeatures, "RDRAND ");
         }
-        else if (strcmp(cptr,"avx") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "rdseed", 6) == 1)
         {
-            cpuid_info.featureFlags |= (1<<AVX);
-            strcat(cpuid_info.features, "AVX ");
+            setBit(cpuid_info.featureFlags, RDSEED);
+            bcatcstr(bfeatures, "RDSEED ");
         }
-        else if (strcmp(cptr,"fma") == 0)
+        else if ((bisstemeqblk(flaglist->entry[i], "avx512", 6) == 1) && (!testBit(cpuid_info.featureFlags, AVX512)))
         {
-            cpuid_info.featureFlags |= (1<<FMA);
-            strcat(cpuid_info.features, "FMA ");
+            setBit(cpuid_info.featureFlags, AVX512);
+            bcatcstr(bfeatures, "AVX512 ");
         }
-        else if (strcmp(cptr,"avx2") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "avx2", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<AVX2);
-            strcat(cpuid_info.features, "AVX2 ");
+            setBit(cpuid_info.featureFlags, AVX2);
+            bcatcstr(bfeatures, "AVX2 ");
         }
-        else if (strcmp(cptr,"rtm") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "avx", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<RTM);
-            strcat(cpuid_info.features, "RTM ");
+            setBit(cpuid_info.featureFlags, AVX);
+            bcatcstr(bfeatures, "AVX ");
         }
-        else if (strcmp(cptr,"hle") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "fma", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<HLE);
-            strcat(cpuid_info.features, "HLE ");
+            setBit(cpuid_info.featureFlags, FMA);
+            bcatcstr(bfeatures, "FMA ");
         }
-        else if (strcmp(cptr,"rdseed") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "rtm", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<RDSEED);
-            strcat(cpuid_info.features, "RDSEED ");
+            setBit(cpuid_info.featureFlags, RTM);
+            bcatcstr(bfeatures, "RTM ");
         }
-        else if (strcmp(cptr,"ht") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "hle", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<HTT);
-            strcat(cpuid_info.features, "HTT ");
+            setBit(cpuid_info.featureFlags, HLE);
+            bcatcstr(bfeatures, "HLE ");
         }
-        else if (strncmp(cptr,"avx512", 6) == 0 && !(cpuid_info.featureFlags & (1<<AVX512)))
+        else if (bisstemeqblk(flaglist->entry[i], "ht", 2) == 1)
         {
-            cpuid_info.featureFlags |= (1<<AVX512);
-            strcat(cpuid_info.features, "AVX512 ");
+            setBit(cpuid_info.featureFlags, HTT);
+            bcatcstr(bfeatures, "HTT ");
         }
-        else if (strcmp(cptr,"swp") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "fp", 2) == 1)
         {
-            cpuid_info.featureFlags |= (1<<SWP);
-            strcat(cpuid_info.features, "SWP ");
+            setBit(cpuid_info.featureFlags, FP);
+            bcatcstr(bfeatures, "FP ");
         }
-        else if (strcmp(cptr,"neon") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "swp", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<NEON);
-            strcat(cpuid_info.features, "NEON ");
+            setBit(cpuid_info.featureFlags, SWP);
+            bcatcstr(bfeatures, "SWP ");
         }
-        else if (strcmp(cptr,"vfp") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "vfpv3", 5) == 1)
         {
-            cpuid_info.featureFlags |= (1<<VFP);
-            strcat(cpuid_info.features, "VFP ");
+            setBit(cpuid_info.featureFlags, VFPV3);
+            bcatcstr(bfeatures, "VFPV3 ");
         }
-        else if (strcmp(cptr,"vfpv3") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "vfpv4", 5) == 1)
         {
-            cpuid_info.featureFlags |= (1<<VFPV3);
-            strcat(cpuid_info.features, "VFPv3 ");
+            setBit(cpuid_info.featureFlags, VFPV4);
+            bcatcstr(bfeatures, "VFPV4 ");
         }
-        else if (strcmp(cptr,"vfpv4") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "vfp", 3) == 1)
         {
-            cpuid_info.featureFlags |= (1<<VFPV4);
-            strcat(cpuid_info.features, "VFPv4 ");
+            setBit(cpuid_info.featureFlags, VFP);
+            bcatcstr(bfeatures, "VFP ");
         }
-        else if (strcmp(cptr,"edsp") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "neon", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<EDSP);
-            strcat(cpuid_info.features, "EDSP ");
+            setBit(cpuid_info.featureFlags, NEON);
+            bcatcstr(bfeatures, "NEON ");
         }
-        else if (strcmp(cptr,"tls") == 0)
+        else if (bisstemeqblk(flaglist->entry[i], "edsp", 4) == 1)
         {
-            cpuid_info.featureFlags |= (1<<TLS);
-            strcat(cpuid_info.features, "TLS ");
+            setBit(cpuid_info.featureFlags, EDSP);
+            bcatcstr(bfeatures, "EDSP ");
         }
-        cptr = strtok(NULL, delimiter);
+        else if (bisstemeqblk(flaglist->entry[i], "tls", 3) == 1)
+        {
+            setBit(cpuid_info.featureFlags, TLS);
+            bcatcstr(bfeatures, "TLS ");
+        }
+        else if (bisstemeqblk(flaglist->entry[i], "asimdrdm", 8) == 1)
+        {
+            setBit(cpuid_info.featureFlags, ASIMDRDM);
+            bcatcstr(bfeatures, "ASIMDRDM ");
+        }
+        else if (bisstemeqblk(flaglist->entry[i], "asimd", 5) == 1)
+        {
+            setBit(cpuid_info.featureFlags, ASIMD);
+            bcatcstr(bfeatures, "ASIMD ");
+        }
+        else if (bisstemeqblk(flaglist->entry[i], "pmull", 5) == 1)
+        {
+            setBit(cpuid_info.featureFlags, PMULL);
+            bcatcstr(bfeatures, "PMULL ");
+        }
+    }
+
+    if (testBit(cpuid_info.featureFlags, SSSE3) && !testBit(cpuid_info.featureFlags, SSE3))
+    {
+        setBit(cpuid_info.featureFlags, SSE3);
+        bcatcstr(bfeatures, "SSE3 ");
     }
 
-    if ((cpuid_info.featureFlags & (1<<SSSE3)) && !((cpuid_info.featureFlags) & (1<<SSE3)))
+    cpuid_info.features = (char*) malloc((blength(bfeatures)+2)*sizeof(char));
+    ret = snprintf(cpuid_info.features, blength(bfeatures)+1, "%s", bdata(bfeatures));
+    if (ret > 0)
     {
-        cpuid_info.featureFlags |= (1<<SSE3);
-        strcat(cpuid_info.features, "SSE3 ");
+        cpuid_info.features[ret] = '\0';
     }
+    bdestroy(bfeatures);
+    bstrListDestroy(flaglist);
 
     get_cpu_perf_data();
     return;
@@ -788,6 +863,7 @@ proc_init_cacheTopology(void)
                 break;
             case ARMV8_FAMILY:
             case ARMV7_FAMILY:
+	    case PPC_FAMILY:
                 cachePool[i].inclusive = 0;
                 break;
             default:
@@ -800,4 +876,3 @@ proc_init_cacheTopology(void)
     cpuid_topology.cacheLevels = cachePool;
     return;
 }
-
diff --git a/test/MPI_pin_test.c b/test/MPI_pin_test.c
index 2f86387c7..f1f12c99f 100644
--- a/test/MPI_pin_test.c
+++ b/test/MPI_pin_test.c
@@ -1,18 +1,31 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <mpi.h>
+
+#include <errno.h>
 #include <sys/types.h>
 #include <string.h>
 #include <sys/syscall.h>
 
+#include <mpi.h>
+
+
 #ifdef _OPENMP
 extern int omp_get_num_threads();
 extern int omp_get_thread_num();
 #endif
 
+#ifdef PTHREADS
+#include <pthread.h>
+#endif
+
 #include <sched.h>
 
+#define HOST_NAME_MAX 1024
+#define MASTER(msg) \
+    if (rank == 0)  printf(#msg "\n")
+#define gettid() (int)syscall(SYS_gettid)
+
 int get_cpu_id()
 {
     int i;
@@ -37,30 +50,92 @@ int get_cpu_id()
     return cpu_id;
 }
 
-#define HOST_NAME_MAX 1024
-#define MASTER(msg) \
-    if (rank == 0)  printf(#msg "\n")
-#define gettid() (int)syscall(SYS_gettid)
+
+/* First CPU id in [0, online CPU count) set in this thread's affinity mask; -1 if none. */
+int get_sched()
+{
+    cpu_set_t mask;
+    int cpu = 0;
+    const int limit = sysconf(_SC_NPROCESSORS_ONLN);
+
+    CPU_ZERO(&mask);
+    sched_getaffinity(gettid(), sizeof(cpu_set_t), &mask);
+
+    while (cpu < limit && !CPU_ISSET(cpu, &mask))
+        cpu++;
+    return (cpu < limit) ? cpu : -1;
+}
+
+
+#ifdef PTHREADS
+struct thread_info {
+    int thread_id;
+    int mpi_id;
+    pid_t pid;
+    pid_t ppid;
+};
+
+/* Thread body: report where this thread runs. Expects `arg` to point to a struct thread_info. */
+void *
+thread_start(void *arg)
+{
+    struct thread_info *tinfo = arg;
+    char host[HOST_NAME_MAX+1];
+
+    /* gethostname() may leave the buffer unterminated when the name is truncated. */
+    host[0] = '\0';
+    gethostname(host, HOST_NAME_MAX);
+    host[HOST_NAME_MAX] = '\0';
+
+    printf ("Rank %d Thread %d running on Node %s core %d/%d with pid %d and tid %d\n",tinfo->mpi_id, tinfo->thread_id, host, sched_getcpu(), get_sched(), tinfo->pid ,gettid());
+    if (tinfo->thread_id == 0)
+    {
+        sleep(tinfo->mpi_id);
+        char cmd[1024];
+        pid_t pid = getppid();
+        snprintf(cmd, 1023, "pstree -p -H %d %d",pid, pid);
+/*        system(cmd);*/
+    }
+    else
+        pthread_exit(NULL);
+    return NULL; /* thread 0 is called directly from main and must return normally */
+}
+#endif
+
 
 main(int argc, char **argv)
 {
-    int rank;
-    char* host;
+    int i = 0;
+    int rank = 0, size = 1;
+    char host[HOST_NAME_MAX];
+    pid_t master_pid = getpid();
+
 
     MPI_Init(&argc,&argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    host = (char*) malloc(HOST_NAME_MAX * sizeof(char));
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
     gethostname(host, HOST_NAME_MAX);
 
     MASTER(MPI started);
     MPI_Barrier(MPI_COMM_WORLD);
-    printf("Process with rank %d running on Node %s Core %d/%d\n",rank ,host, sched_getcpu(),get_cpu_id());
+    printf("Process with rank %d running on Node %s core %d/%d with pid %d\n",rank ,host, sched_getcpu(),get_cpu_id(), master_pid);
     MPI_Barrier(MPI_COMM_WORLD);
 
+
+
+#ifdef _OPENMP
     MASTER(Enter OpenMP parallel region);
     MPI_Barrier(MPI_COMM_WORLD);
+    MASTER(Start OpenMP threads);
 #pragma omp parallel
     {
+#pragma omp barrier
+
+#pragma omp critical
+        {
+            printf ("Rank %d Thread %d running on Node %s core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), host, sched_getcpu(), get_sched(), master_pid ,gettid());
+        }
 #pragma omp master
         {
             pid_t pid = getppid();
@@ -68,15 +143,41 @@ main(int argc, char **argv)
             sprintf(cmd, "pstree -p -H %d %d",pid, pid);
             system(cmd);
         }
-#ifdef _OPENMP
-#pragma omp critical
-        {
-            printf ("Rank %d Thread %d running on Node %s core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), host, sched_getcpu(),get_cpu_id(), getpid(),gettid());
-        }
+    }
 #endif
 
+
+#ifdef PTHREADS
+    int err = 0;
+    struct thread_info tinfos[4];
+    pthread_t threads[4] = {0};
+    /* Threads use default attributes: pthread_create() gets a NULL attr below. */
+
+    pid_t pid = getppid();
+    for (i = 0; i < 4; i++)
+    {
+        tinfos[i].thread_id = i;
+        tinfos[i].mpi_id = rank;
+        tinfos[i].pid = master_pid;
+        tinfos[i].ppid = pid;
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    MASTER(Start Pthread threads);
+    MPI_Barrier(MPI_COMM_WORLD);
+    for (i = 1; i < 4; i++)
+    {
+        err = pthread_create(&threads[i], NULL, thread_start, (void*)&tinfos[i]);
+        if (err != 0) printf("pthread_create %d error: %s\n", i, strerror(err));
+    }
+    thread_start((void*)&tinfos[0]);
+
+    for (i = 1; i < 4; i++)
+    {
+        pthread_join(threads[i], NULL);
     }
+#endif
 
-    free(host);
     MPI_Finalize();
+
+    return 0;
 }
diff --git a/test/check_events_files.py b/test/check_data_files.py
similarity index 65%
rename from test/check_events_files.py
rename to test/check_data_files.py
index 7c3e5dd09..0253f3332 100755
--- a/test/check_events_files.py
+++ b/test/check_data_files.py
@@ -3,7 +3,7 @@
 
 # =======================================================================================
 #
-#      Filename:  check_events_files.py
+#      Filename:  check_data_files.py
 #
 #      Description:  Basic checks for performance event list files
 #
@@ -25,9 +25,12 @@
 #      this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 # =======================================================================================
+
 from collections import OrderedDict
 import json
+import logging as log
 from pathlib import Path
+from pprint import pprint, pformat
 import sys
 
 from pyparsing import *
@@ -36,7 +39,7 @@
 
 
 def err(fmt, *args, **kw):
-    print(fmt.format(*args, **kw), file=sys.stderr)
+    log.error(fmt.format(*args, **kw))
 
 
 
@@ -389,25 +392,266 @@ def resolve_events_files(opts):
     return files
 
 
+
 def check_events(opts):
 
+    start_logging(opts)  # configure from the options passed in, not the module-level 'args'
     files = resolve_events_files(opts)
+    if opts.json:
+        opts.output_dir.mkdir(parents=True, exist_ok=True)
     ep = EventParser()
     nerrors = 0
     for fn in files:
-        if opts.verbose:
-            print("checking: {}".format(fn))
+        log.info("checking: {}".format(fn))
         with fn.open() as f:
             nerrors += ep.check(f, opts)
         if opts.json:
             o = opts.output_dir / fn.name.replace(fn.suffix, '.json')
-            ep.to_json(o.open('w'), indent=4)
-    if opts.verbose and nerrors:
-        print("\nFound {} errors".format(nerrors))
+            with o.open('w') as jf:
+                ep.to_json(jf, indent=4)
+    if nerrors:
+        log.info("\nFound {} errors".format(nerrors))
+    return nerrors > 0
+
+
+
+UNITS = [
+    's',
+    'MHz',
+    'MFLOP/s',
+    'MUOPS/s',
+    'MBytes/s',
+    'GBytes',
+    '%',
+    'J',
+    'W',
+    'C',
+    ]
+
+def miss_brackets_in_units(s):
+    """Return the first UNITS entry found in `s` as a bare space-delimited word, else None."""
+    for unit in UNITS:
+        bare = ' {}'.format(unit)
+        # A bare unit is either followed by another space or ends the string.
+        if bare + ' ' in s or s.endswith(bare):
+            return unit
+    return None
+
+
+
+def extract_units(s):
+    """Split a trailing '[unit]' word off `s`; return (text, unit), or (s, None) if there is none."""
+    words = s.split()
+    # Units only count when bracketed and preceded by at least one other word.
+    if len(words) < 2 or not (words[-1].startswith('[') and words[-1].endswith(']')):
+        return s, None
+    return ' '.join(words[:-1]), words[-1][1:-1]
+
+
+
+class GroupParser():
+    """
+    eol = '\n' ;
+    letter = 'A' - 'Z' | 'a' - 'z' ;
+    digit = '0' - '9'
+    name = letter, { letter | digit | '_' } ;
+    word = -eol, { -eol } ;
+    line = word, { word }, eol ;
+    op = '*' | '/' | '+' | '-' ;
+
+    short = "SHORT", short_title, eol ;
+    noht = "REQUIRE_NOHT", eol ;
+    counter_name = name ;
+    event_name = name ;
+    event = counter_name, event_name, eol ;
+    eventset = "EVENTSET", eol, event, { event }, eol ;
+    metric_name = letter, { letter | digit | '_' | '(' | ')' | ' ' } ;
+    units = UNITS
+    metric = metric_name, [ units ], counter_name, {  op, counter_name }, eol ;
+    metrics = "METRICS", eol, metric, { metric }, eol ;
+    formula = metric_name, [ units ], '=', event_name, {  op,  event_name}, eol ;
+    formulas = "Formulas:", eol, formula, { formula } ;
+    sep = '--' | '-', eol ;
+    long = line, { line } ;
+    group = short, [ noht ], eventset, metrics, "LONG", eol, formulas, sep, long
+    """
+
+    def __init__(self):
+
+        EOL = LineEnd().suppress()
+        Line = LineStart() + SkipTo(LineEnd(), failOn=LineStart()+LineEnd()) + EOL
+
+        short = Keyword('SHORT').suppress() + SkipTo(LineEnd(), failOn=LineStart()+LineEnd()) + EOL
+        short.setParseAction(
+            lambda s, loc, toks:
+                self.group.update(short=toks[0])
+        )
+        self.parser = short + OneOrMore(EOL)
+
+        noht = Keyword('REQUIRE_NOHT')
+        noht.setParseAction(
+            lambda s, loc, toks:
+                self.group.update(require_noht=True)
+        )
+        self.parser += Optional(noht() + OneOrMore(EOL))
+
+        eventset = Keyword('EVENTSET').suppress() + EOL + \
+        Group(OneOrMore(
+            LineStart() + Group(Word(alphanums) + SkipTo(LineEnd(), failOn=LineStart()+LineEnd())
+                            ).setParseAction(self.add_event) + EOL
+        ))
+        self.parser += eventset + OneOrMore(EOL)
+
+        metrics = Keyword('METRICS').suppress() + EOL + \
+        Group(OneOrMore(
+            LineStart() + Group(OneOrMore(Word(alphanums+'.[]()-+*/')) + SkipTo(LineEnd(), failOn=LineStart()+LineEnd())
+                            ).setParseAction(self.add_metric) + EOL
+        ))
+        self.parser += metrics + OneOrMore(EOL)
+
+        long = Keyword('LONG').suppress() + EOL
+        self.parser += long
+
+        formulae = Keyword('Formulas:') + EOL + OneOrMore(Line().setParseAction(self.add_formula))
+        self.parser += formulae
+
+        descr = (Keyword('-') ^ Keyword('--')).suppress() + EOL + Group(OneOrMore(Line()))
+        descr.setParseAction(
+            lambda s, loc, toks:
+                self.group.update(long=' '.join(toks[0]))
+        )
+        self.parser += descr
+
+
+    def add_event(self, s, loc, toks):
+        log.debug("add_event: |{}|".format(toks[0]))
+        self.group['events'].append(dict(counter=toks[0][0], event=toks[0][1]))
+
+
+    def add_metric(self, s, loc, toks):
+        log.debug("add_metric: |{}|".format(toks[0]))
+        metric = ' '.join(toks[0][0:-2])
+        dim = miss_brackets_in_units(metric)
+        if dim:
+            raise ParseSyntaxException(s, loc, "expected '[{}]', found '{}'".format(dim, dim))
+        metric, units = extract_units(metric)
+        formula = toks[0][-2]
+        self.group['metrics'].append(dict(metric=metric, formula=formula, units=units))
+
+
+    def add_formula(self, s, loc, toks):
+        if not '=' in toks[0]:
+            raise ParseException(s,loc,"expected '=' sign")
+        metric, formula = [ x.strip() for x in toks[0].split('=', 1) ]
+        dim = miss_brackets_in_units(metric)
+        if dim:
+            raise ParseSyntaxException(s, loc, "expected '[{}]', found '{}'".format(dim, dim))
+        metric, units = extract_units(metric)
+        log.debug("add_formula: |{}| = |{}|".format(metric, formula))
+        #if metric in self.group['formulae']:
+            #self.nerrors += 1
+            #err("\n{}:{}:\nDuplicate metric formula: {}", self.fname, self.line_num, token['name'])
+        self.group['formulae'].append(dict(metric=metric, formula=formula, units=units))
+
+
+    def reset(self, fname, opts):
+        self.fname = fname
+        self.opts = opts
+        self.nerrors = 0
+        self.group = dict(
+                          short=None,
+                          events=[],
+                          metrics=[],
+                          formulae=[],
+                          long=[]
+                          )
+        self.nerrors = 0
+
+
+    def check(self, afile, opts):  # parse one open group file; returns the error count (0 or 1)
+        self.reset(afile.name, opts)
+        try:
+            self.parser.parseFile(afile)
+            log.debug(pformat(self.group))
+            log.debug('\ngroup=\n{}'.format(json.dumps(self.group, indent=2)))
+        except ParseBaseException as pex:  # only pyparsing errors carry lineno/col/msg/line
+            self.nerrors += 1
+            err('\n{}:{}:{}:', self.fname, pex.lineno, pex.col)
+            err('{}:', pex.msg)
+            err('{}', pex.line)
+            err("{}^", '-'*(pex.col-1))
+        return self.nerrors
+
+
+    def to_json(self, f, indent=4, sort_keys=True):
+
+        json.dump(self.group, f, indent=indent, sort_keys=sort_keys)
+
+
+
+def resolve_group_files(opts):
+
+    if not opts.files:
+        files = list(opts.input_dir.glob("**/*.txt"))
+        if not files:
+            raise Exception("Couldn't find any group files in '{}', "
+                            "please use --input-dir option.\n"
+                            "Use `{} groups --help` for details."
+                            .format(opts.input_dir, sys.argv[0]))
+    else:
+        files = []
+        for f in opts.files:
+            f = Path(f).expanduser()
+            _files = []
+            if f.is_file():
+                _files.append(f.resolve())
+            elif (opts.input_dir / f).is_file():
+                _files.append((opts.input_dir / f).resolve())
+            elif (opts.input_dir / "{}.txt".format(f)).is_file():
+                _files.append((opts.input_dir / "{}.txt".format(f)).resolve())
+            elif (opts.input_dir / f).is_dir():
+                _files = [x.resolve() for x in (opts.input_dir / f).glob("*.txt")]
+            if not _files:
+                trials = [f, (opts.input_dir / f), opts.input_dir / "{}.txt".format(f), (opts.input_dir / f / '*.txt')]
+                raise Exception("Can't find data file:\n\t   '{}'".format("'\n\tor '".join([str(f) for f in trials])))
+            files.extend(_files)
+    return files
+
+
+
+def check_groups(opts):
+    """Check every resolved group file with GroupParser; the result is True if any errors were found."""
+    start_logging(opts)  # configure from the options passed in, not the module-level 'args'
+    files = resolve_group_files(opts)
+    gp = GroupParser()
+    nerrors = 0
+    for fn in files:
+        log.info("checking: {}".format(fn))
+        with fn.open() as f:
+            nerrors += gp.check(f, opts)
+        if opts.json:
+            # Mirror the file's path relative to the input directory under the output directory.
+            o = opts.output_dir / fn.relative_to(opts.input_dir).with_suffix('.json')
+            o.parent.mkdir(parents=True, exist_ok=True)
+            with o.open('w') as jf:
+                gp.to_json(jf, indent=4)
+    if nerrors: log.info("\nFound {} errors".format(nerrors))
     return nerrors > 0
 
 
 
+def start_logging(args):
+    """Configure root logging from `args.verbose`: 0 -> ERROR, 1 -> INFO, 2 or more -> DEBUG."""
+    by_verbosity = (log.ERROR, log.INFO, log.DEBUG)
+    capped = min(args.verbose, len(by_verbosity) - 1)
+    log.basicConfig(
+                    #filename=str(Path(__file__).with_suffix('.log')),
+                    #filemode='w',
+                    format='%(message)s',
+                    level=by_verbosity[capped])
+
+
+
 if __name__ == "__main__":
 
     import argparse
@@ -418,7 +662,6 @@ def check_events(opts):
     subparsers = ap.add_subparsers(title='subcommands',
                                         description="use '<subcmd> --help' for "
                                                     "help on subcommands",
-                                        #help='valid subcommands'
                                         )
 
     etest = subparsers.add_parser('events', help="test events data files")
@@ -442,7 +685,7 @@ def check_events(opts):
                 help="dump event descriptions in JSON format"
                 )
     etest.add_argument('--output-dir', '-o', type=Path, default=Path('.'),
-                help="path to the directory where to put output files, "
+                help="directory where to put output JSON files, "
                      "default: current directory"
                 )
     etest.add_argument('--naming', '-n', action='store_true', default=False,
@@ -451,9 +694,28 @@ def check_events(opts):
                      "have the name from the EVENT line as a prefix."
                 )
 
-    self_test = subparsers.add_parser('self', help="self test for the checker")
+    self_test = subparsers.add_parser('self', help="self test for the events checker")
     self_test.set_defaults(func=test_EventParser)
 
+    # 'groups' subcommand: dispatches to check_groups() for the group data files.
+    gtest = subparsers.add_parser('groups', help="test group descriptions")
+    gtest.set_defaults(func=check_groups)
+    gtest.add_argument('--verbose', '-v', action='count', default=0,
+                    help="increase log verbosity; may be given more than once")
+    # Prefer the ../groups directory next to this script; fall back to the script's directory.
+    default = Path(__file__).parent / "../groups"
+    if default.exists():
+        default = default.resolve()
+    else:
+        default = Path(__file__).parent
+    gtest.add_argument('--input-dir', '-d', type=Path, default=default,
+                    help='path to the directory with groups data files, default: {}'.format(default))
+    gtest.add_argument('files', metavar='FILE', type=str, nargs='*', help='group data file to check')
+    gtest.add_argument('--json', '-j', action='store_true', default=False,
+                    help="dump group descriptions in JSON format")
+    gtest.add_argument('--output-dir', '-o', type=Path, default=Path('.'),
+                    help='directory where to put output JSON files, default: current directory')
+
     args = ap.parse_args()
 
     try:
diff --git a/test/jacobi-2D-5pt.c b/test/jacobi-2D-5pt.c
index 7f392ac44..9cc0415c9 100644
--- a/test/jacobi-2D-5pt.c
+++ b/test/jacobi-2D-5pt.c
@@ -4,7 +4,7 @@
 #include<sys/time.h>
 
 #ifdef LIKWID_PERFMON
-#include <likwid.h>
+#include <likwid-marker.h>
 #endif
 
 int main()
diff --git a/test/serial.c b/test/serial.c
index 45a37e071..660bae724 100644
--- a/test/serial.c
+++ b/test/serial.c
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include <unistd.h>
 
-#include <likwid.h>
+#include <likwid-marker.h>
 
 int main(int argc, char* argv[])
 {
diff --git a/test/stream.c b/test/stream.c
index ea811a2ac..7bbf473ec 100644
--- a/test/stream.c
+++ b/test/stream.c
@@ -17,7 +17,7 @@
 #define SIZE 40000000
 
 #define gettid() syscall(SYS_gettid)
-#include <likwid.h>
+#include <likwid-marker.h>
 #define HLINE "-------------------------------------------------------------\n"
 
 #ifndef MIN
@@ -246,4 +246,3 @@ int main(int argn, char** argc)
     free(d);
     return 0;
 }
-
diff --git a/test/stream.cc b/test/stream.cc
index 489bb092a..59f469a02 100644
--- a/test/stream.cc
+++ b/test/stream.cc
@@ -4,7 +4,7 @@
 #include <string>
 #include <atomic>
 #include <thread>
-#include <likwid.h>
+#include <likwid-marker.h>
 #include <sched.h>
 #include <syscall.h>
 #include <sys/time.h>
diff --git a/test/stream_cilk.c b/test/stream_cilk.c
index fcbe1a70c..60993af18 100644
--- a/test/stream_cilk.c
+++ b/test/stream_cilk.c
@@ -17,7 +17,7 @@
 #define SIZE 40000000
 
 #define gettid() syscall(SYS_gettid)
-#include <likwid.h>
+#include <likwid-marker.h>
 #define HLINE "-------------------------------------------------------------\n"
 
 #ifndef MIN
@@ -196,7 +196,7 @@ int main(){
     }
     time_stop(&timer);
     triad_time = time_print(&timer)/(double)ITER;
-    
+
     printf("Processed %.1f Mbyte at copy benchmark in %.4f seconds: %.2f MByte/s\n",
                         1E-6*(2*SIZE*sizeof(double)),
                         copy_time,
diff --git a/test/testTBB.cc b/test/testTBB.cc
index 887400f19..88c622303 100644
--- a/test/testTBB.cc
+++ b/test/testTBB.cc
@@ -15,7 +15,7 @@
 
 // Added by Thomas Roehl
 #include <sched.h>
-#include <likwid.h>
+#include <likwid-marker.h>
 
 
 struct mytask {
@@ -23,10 +23,10 @@ struct mytask {
     :_n(n)
   {}
   void operator()() {
-    
+
     for (int i=0;i<10000000;++i) {}  // Deliberately run slow
     std::cerr << "[" << sched_getcpu() << "]";
-    
+
   }
   size_t _n;
 };
diff --git a/test/testmarker-cnt.c b/test/testmarker-cnt.c
index d25f38740..a5a7e14d2 100644
--- a/test/testmarker-cnt.c
+++ b/test/testmarker-cnt.c
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <omp.h>
 
-#include <likwid.h>
+#include <likwid-marker.h>
 
 #define SIZE 1000000
 
@@ -43,7 +43,7 @@ int main(int argc, char* argv[])
             LIKWID_MARKER_START(label);
             for (j = 0; j < counter * threadId; j++)
             {
-                for (i = 0; i < SIZE; i++) 
+                for (i = 0; i < SIZE; i++)
                 {
                     a[i] = b[i] + alpha * c[i];
                     sum += a[i];
diff --git a/test/testmarker-nested.c b/test/testmarker-nested.c
index 784623a6d..c25c9dccb 100644
--- a/test/testmarker-nested.c
+++ b/test/testmarker-nested.c
@@ -5,7 +5,7 @@
 #include <omp.h>
 #include <sched.h>
 
-#include <likwid.h>
+#include <likwid-marker.h>
 
 #define SIZE 1000000
 #define N1   1000
diff --git a/test/testmarker-omp.c b/test/testmarker-omp.c
index e5bc4dba8..55355771f 100644
--- a/test/testmarker-omp.c
+++ b/test/testmarker-omp.c
@@ -4,7 +4,7 @@
 #include <omp.h>
 #include <unistd.h>
 
-#include <likwid.h>
+#include <likwid-marker.h>
 
 #define SIZE 1000000
 
@@ -40,7 +40,7 @@ int main(int argc, char* argv[])
 
             LIKWID_MARKER_START("plain");
             for (int k = 0; k < (threadId+1); k++)  {
-                for (int i = 0; i < SIZE; i++) 
+                for (int i = 0; i < SIZE; i++)
                 {
                     a[i] = b[i] + alpha * c[i];
                     sum += a[i];