From 699ba6ca46b74eb17e34ccda510901d140d54b63 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Sat, 9 Nov 2019 23:32:03 +0100 Subject: [PATCH] Update Nvidia GPU backend to working state --- Makefile | 2 + config.mk | 14 +- make/config_checks.mk | 4 +- src/applications/likwid-perfctr.lua | 214 +++-- src/applications/likwid-topology.lua | 45 +- src/applications/likwid.lua | 1 + src/cpustring.c | 4 + src/includes/likwid-gpumarker.h | 2 +- src/includes/likwid.h | 9 +- src/includes/nvmon_cupti.h | 1000 +++++++++++++++++++++ src/includes/nvmon_nvml.h | 50 ++ src/includes/nvmon_perfworks.h | 274 ++++++ src/includes/nvmon_types.h | 24 + src/luawid.c | 96 +- src/nvmon.c | 1208 ++++---------------------- src/topology_gpu.c | 25 +- test/Makefile | 9 +- test/cupti/matmul.cu | 370 ++++++++ test/cupti/matmul.h | 17 + test/cupti/matmul_marker.c | 27 + test/cupti/realtime.h | 74 ++ 21 files changed, 2297 insertions(+), 1172 deletions(-) create mode 100644 src/includes/nvmon_cupti.h create mode 100644 src/includes/nvmon_nvml.h create mode 100644 src/includes/nvmon_perfworks.h create mode 100644 test/cupti/matmul.cu create mode 100644 test/cupti/matmul.h create mode 100644 test/cupti/matmul_marker.c create mode 100644 test/cupti/realtime.h diff --git a/Makefile b/Makefile index f06293d79..6662b44f8 100644 --- a/Makefile +++ b/Makefile @@ -91,6 +91,8 @@ OBJ := $(filter-out $(BUILD_DIR)/loadDataARM.o,$(OBJ)) endif ifneq ($(NVIDIA_INTERFACE), true) OBJ := $(filter-out $(BUILD_DIR)/nvmon.o,$(OBJ)) +OBJ := $(filter-out $(BUILD_DIR)/topology_gpu.o,$(OBJ)) +OBJ := $(filter-out $(BUILD_DIR)/libnvctr.o,$(OBJ)) endif PERFMONHEADERS = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildcard $(SRC_DIR)/includes/*.txt)) OBJ_LUA = $(wildcard ./ext/lua/$(COMPILER)/*.o) diff --git a/config.mk b/config.mk index b25dfd847..82d2969a9 100644 --- a/config.mk +++ b/config.mk @@ -10,7 +10,7 @@ COMPILER = GCC#NO SPACE # Path were to install likwid -PREFIX ?= /usr/local#NO SPACE +PREFIX ?= 
/apps/likwid/4.3.4#NO SPACE # Set the default mode for MSR access. # This can usually be overriden on the commandline. @@ -26,6 +26,7 @@ FORTRAN_INTERFACE = false#NO SPACE INSTRUMENT_BENCH = true#NO SPACE # Build LIKWID with NVIDIA interface (CUDA, CUPTI) +# For configuring include paths, go to CUDA section NVIDIA_INTERFACE = true#NO SPACE ################################################################# @@ -85,7 +86,7 @@ INSTALL_CHOWN = -g root -o root#NO SPACE SHARED_LIBRARY = true#NO SPACE # Build LIKWID with debug flags -DEBUG = false#NO SPACE +DEBUG = true#NO SPACE # Basic configuration for some internal arrays. # Maximal number of hardware threads @@ -140,12 +141,11 @@ LIKWIDFILTERPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/filter) # $HOME/.likwid/groups LIKWIDGROUPPATH = $(abspath $(INSTALLED_PREFIX)/share/likwid/perfgroups) -# CUDA info +# CUDA / CUPTI build data +# LIKWID requires CUDA and CUPTI to be present only for compilation with +# NVIDIA_INTERFACE=true. At runtime, the CUDA and the CUPTI library have +# to be in the LD_LIBRARY_PATH to dynamically load the libraries. 
# Include directory for CUDA headers CUDAINCLUDE = $(CUDA_HOME)/include -# Library directory for CUDA libs -CUDALIBDIR = $(CUDA_HOME)/lib64 # Include directory for CUPTI headers CUPTIINCLUDE = $(CUDA_HOME)/extras/CUPTI/include -# Library directory for CUPTI libs -CUPTILIBDIR = $(CUDA_HOME)/extras/CUPTI/lib64 diff --git a/make/config_checks.mk b/make/config_checks.mk index d3be955af..d05d77b69 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -77,7 +77,7 @@ endif endif ifeq ($(strip $(NVIDIA_INTERFACE)), true) -LIBS+= -lcuda -ldl +#LIBS+= -lcuda -ldl INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE) -CPPFLAGS += -L$(CUDALIBDIR) -L$(CUPTILIBDIR) +#CPPFLAGS += -L$(CUDALIBDIR) -L$(CUPTILIBDIR) endif diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua index 6ca26908a..f5de7f097 100644 --- a/src/applications/likwid-perfctr.lua +++ b/src/applications/likwid-perfctr.lua @@ -49,13 +49,19 @@ local function examples() io.stdout:write("likwid-perfctr -e\n") io.stdout:write("List all events and suitable counters for events with 'L2' in them:\n") io.stdout:write("likwid-perfctr -E L2\n") - io.stdout:write("Run command on CPU 2 and measure performance group TEST:\n") - io.stdout:write("likwid-perfctr -C 2 -g TEST ./a.out\n") + io.stdout:write("Run command on CPU 2 and measure performance group CLOCK:\n") + io.stdout:write("likwid-perfctr -C 2 -g CLOCK ./a.out\n") + if likwid.gpuSupported() then + io.stdout:write("Run command and measure on GPU 1 the performance group FLOPS_DP (Only with NVMarkerAPI):\n") + io.stdout:write("likwid-perfctr -G 1 -W FLOPS_DP -m ./a.out\n") + io.stdout:write("It is possible to combine CPU and GPU measurements (with MarkerAPI and NVMarkerAPI):\n") + io.stdout:write("likwid-perfctr -C 2 -g CLOCK -G 1 -W FLOPS_DP -m ./a.out\n") + end end local function usage() version() - io.stdout:write("A tool to read out
performance counter registers on x86, ARM and POWER processors\n\n") io.stdout:write("Options:\n") io.stdout:write("-h, --help\t\t Help message\n") io.stdout:write("-v, --version\t\t Version information\n") @@ -63,7 +69,13 @@ local function usage() io.stdout:write("-c \t\t Processor ids to measure (required), e.g. 1,2-4,8\n") io.stdout:write("-C \t\t Processor ids to pin threads and measure, e.g. 1,2-4,8\n") io.stdout:write("\t\t\t For information about the syntax, see likwid-pin\n") - io.stdout:write("-g, --group \t Performance group or custom event set string\n") + if likwid.gpuSupported() then + io.stdout:write("-G, --gpus \t List of GPUs to monitor\n") + end + io.stdout:write("-g, --group \t Performance group or custom event set string for CPU monitoring\n") + if likwid.gpuSupported() then + io.stdout:write("-W, --gpugroup \t Performance group or custom event set string for GPU monitoring\n") + end io.stdout:write("-H\t\t\t Get group help (together with -g switch)\n") io.stdout:write("-s, --skip \t Bitmask with threads to skip\n") io.stdout:write("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon\n") @@ -73,8 +85,6 @@ local function usage() io.stdout:write("-i, --info\t\t Print CPU info\n") io.stdout:write("-T