diff --git a/Makefile b/Makefile index e133bdb7e..e7fed144e 100644 --- a/Makefile +++ b/Makefile @@ -122,9 +122,15 @@ OBJ := $(filter-out $(BUILD_DIR)/loadDataARM.o,$(OBJ)) endif ifneq ($(NVIDIA_INTERFACE), true) OBJ := $(filter-out $(BUILD_DIR)/nvmon.o,$(OBJ)) +OBJ := $(filter-out $(BUILD_DIR)/nvmon_nvml.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/topology_gpu.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/libnvctr.o,$(OBJ)) endif +ifneq ($(ROCM_INTERFACE), true) +OBJ := $(filter-out $(BUILD_DIR)/rocmon.o,$(OBJ)) +OBJ := $(filter-out $(BUILD_DIR)/rocmon-marker.o,$(OBJ)) +OBJ := $(filter-out $(BUILD_DIR)/topology_gpu_rocm.o,$(OBJ)) +endif ifeq ($(COMPILER),GCCPOWER) OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ)) OBJ := $(filter-out $(BUILD_DIR)/access_x86.o,$(OBJ)) @@ -195,6 +201,7 @@ $(L_APPS): $(addprefix $(SRC_DIR)/applications/,$(addsuffix .lua,$(L_APPS))) @echo "===> ADJUSTING $@" @if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 1"/"access_mode = 0"/g $(SRC_DIR)/applications/$@.lua;fi @sed -e s/''/$(subst /,\\/,$(INSTALLED_BINPREFIX))/g \ + -e s/''/$(subst /,\\/,$(INSTALLED_LIBPREFIX))/g \ -e s/''/$(subst /,\\/,$(INSTALLED_PREFIX))/g \ -e s/''/$(VERSION).$(RELEASE).$(MINOR)/g \ -e s/''/$(DATE)/g \ @@ -236,6 +243,7 @@ $(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_HWLOC_LIB) @ln -sf $(TARGET_LIB) $(TARGET_LIB).$(VERSION).$(RELEASE) @sed -e s+'@PREFIX@'+$(INSTALLED_PREFIX)+g \ -e s+'@NVIDIA_INTERFACE@'+$(NVIDIA_INTERFACE)+g \ + -e s+'@ROCM_INTERFACE@'+$(ROCM_INTERFACE)+g \ -e s+'@FORTRAN_INTERFACE@'+$(FORTRAN_INTERFACE)+g \ -e s+'@LIBPREFIX@'+$(INSTALLED_LIBPREFIX)+g \ -e s+'@BINPREFIX@'+$(INSTALLED_BINPREFIX)+g \ @@ -303,6 +311,11 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ $(Q)$(CC) $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d +$(BUILD_DIR)/rocmon_marker.o: rocmon_marker.c + @echo "===> COMPILE $@" + $(Q)$(CC) -c $(DEBUG_FLAGS) 
$(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@ + $(Q)objcopy --redefine-sym HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE=HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE2 $@ + $(BUILD_DIR)/%.o: %.cc @echo "===> COMPILE $@" $(Q)$(CXX) -c $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/README_ROCM.md b/README_ROCM.md new file mode 100644 index 000000000..3553f841e --- /dev/null +++ b/README_ROCM.md @@ -0,0 +1,28 @@ +## Build & Install + +```bash +export ROCM_HOME=/opt/rocm +make +make install +``` + +## Test + +Build + +```bash +cd test +# make clean +make test-topology-gpu-rocm +make test-rocmon-triad +make test-rocmon-triad-marker +``` + +Run + +```bash +export LD_LIBRARY_PATH=/home/users/kraljic/likwid-rocmon/install/lib:/opt/rocm/hip/lib:/opt/rocm/hsa/lib:/opt/rocm/rocprofiler/lib:$LD_LIBRARY_PATH +export ROCP_METRICS=/opt/rocm/rocprofiler/lib/metrics.xml # for rocmon test +export HSA_TOOLS_LIB=librocprofiler64.so.1 # allows rocmon to intercept hsa commands +./gpu-test-topology-gpu-rocm +``` diff --git a/config.mk b/config.mk index 32633a9e1..dd3be85a9 100644 --- a/config.mk +++ b/config.mk @@ -30,6 +30,10 @@ INSTRUMENT_BENCH = true#NO SPACE # For configuring include paths, go to CUDA section NVIDIA_INTERFACE = false#NO SPACE +# Build LIKWID with AMD GPU interface (ROCm) +# For configuring include paths, go to ROCm section +ROCM_INTERFACE = false#NO SPACE + ################################################################# ################################################################# # Advanced configuration options # @@ -172,3 +176,13 @@ CUPTIINCLUDE = $(CUDA_HOME)/extras/CUPTI/include # In order to hook into the CUDA application, the appDaemon is required # If you just want the NvMarkerAPI, you can keep it false BUILDAPPDAEMON=false + +# ROCm build data +# LIKWID requires ROCm to be present only for compilation with +# ROCM_INTERFACE=true. At runtime, the ROCm library have +# to be in the LD_LIBRARY_PATH to dynamically load the libraries. 
+# Include directory for ROCm headers +HSAINCLUDE = $(ROCM_HOME)/include +ROCPROFILERINCLUDE = $(ROCM_HOME)/include/rocprofiler +HIPINCLUDE = $(ROCM_HOME)/include +RSMIINCLUDE = $(ROCM_HOME)/include diff --git a/doc/applications/likwid-perfctr.md b/doc/applications/likwid-perfctr.md index 25077c39a..3f3cdd244 100644 --- a/doc/applications/likwid-perfctr.md +++ b/doc/applications/likwid-perfctr.md @@ -56,7 +56,11 @@ custom event sets. The \ref Marker_API can measure mulitple named regions and th -W, --gpugroup <arg> - Specify which event string or performance group should be measured on the GPUs. Only if built with NVIDIA_INTERFACE=true. + Specify which event string or performance group should be measured on the Nvidia GPUs. Only if built with NVIDIA_INTERFACE=true. + + + -R <arg> + Specify which event string or performance group should be measured on the AMD GPUs. Only if built with ROCM_INTERFACE=true. -c <arg> @@ -68,7 +72,11 @@ custom event sets. The \ref Marker_API can measure mulitple named regions and th -G <arg> - Defines the GPUs that should be measured
You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with NVIDIA_INTERFACE=true. + Defines the Nvidia GPUs that should be measured
You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with NVIDIA_INTERFACE=true. + + + -I <arg> + Defines the AMD GPUs that should be measured
You can use simple lists like 0,1,3 or ranges like 0-2. Only if built with ROCM_INTERFACE=true. -H @@ -274,6 +282,8 @@ The LIKWID package contains an example code: see \ref F-markerAPI-code. Since the calls to the LIKWID library are executed by your application, the runtime will raise and in specific circumstances, there are some other problems like the time measurement. You can execute LIKWID_MARKER_THREADINIT and LIKWID_MARKER_START inside the same parallel region but put a barrier between the calls to ensure that there is no big timing difference between the threads. The common way is to init LIKWID and the participating threads inside of an initialization routine, use only START and STOP in your code and close the Marker API in a finalization routine. Be aware that at the first start of a region, the thread-local hash table gets a new entry to store the measured values. If your code inside the region is short or you are executing the region only once, the overhead of creating the hash table entry can be significant compared to the execution of the region code. The overhead of creating the hash tables can be done in prior by using the LIKWID_MARKER_REGISTER function. It must be called by each thread and one time for each compute region. It is completely optional, LIKWID_MARKER_START performs the same operations.

CUDA code

-With LIKWID 5.0 CUDA kernels can be measured. There is a special NvMarkerAPI for Nvidia GPUs. The usage is similar to the CPU MarkerAPI, just replace LIKWID_MARKER_ with LIKWID_NVMARKER_. The two MarkerAPIs can be mixed. +With LIKWID 5.0 CUDA kernels can be measured. There is a special NvMarkerAPI for Nvidia GPUs. The usage is similar to the CPU MarkerAPI, just replace LIKWID_MARKER_ with LIKWID_NVMARKER_. All MarkerAPIs can be mixed. +

ROCm code

+ROCm kernels can be measured. There is a special RocmonMarkerAPI for AMD GPUs. The usage is similar to the CPU or Nvidia MarkerAPI, just replace LIKWID_MARKER_ with ROCMON_MARKER_. All MarkerAPIs can be mixed. */ diff --git a/doc/likwid-doxygen.md b/doc/likwid-doxygen.md index b7788df66..2a4f96305 100644 --- a/doc/likwid-doxygen.md +++ b/doc/likwid-doxygen.md @@ -1,7 +1,7 @@ /*! \mainpage LIKWID - Like I Knew What I Am Doing \section Introduction -This is an effort to develop easy to use but yet powerful performance tools for the GNU Linux operating system. While the focus of LIKWID was on x86 processors, it is now ported to ARM and POWER processors. A backend for Nvidia GPUs is part of LIKWID with version 5.0.
+This is an effort to develop easy to use yet powerful performance tools for the GNU Linux operating system. While the focus of LIKWID was on x86 processors, it is now ported to ARM and POWER processors. A backend for Nvidia GPUs is part of LIKWID with version 5.0. With the Rocmon backend, AMD GPUs can be monitored.
LIKWID follows the philosophy: - Simple @@ -16,7 +16,7 @@ LIKWID follows the philosophy: \section Tools LIKWID Tools - \ref likwid-topology : A tool to display the thread and cache topology on multicore/multisocket computers. - \ref likwid-pin : A tool to pin your threaded application without changing your code. Works for pthreads and OpenMP. -- \ref likwid-perfctr : A tool to measure hardware performance counters on x86, ARM and POWER processors as well as Nvidia GPUs. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code. +- \ref likwid-perfctr : A tool to measure hardware performance counters on x86, ARM and POWER processors as well as Nvidia/AMD GPUs. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code. - \ref likwid-powermeter : A tool for accessing RAPL counters and query Turbo mode steps on Intel processor. RAPL counters are also available in \ref likwid-perfctr. - \ref likwid-setFrequencies : A tool to print and manage the clock frequency of CPU hardware threads and the Uncore (Intel only). - \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks. 
@@ -133,6 +133,9 @@ Optionally, a global configuration file \ref likwid.cfg can be given to modify s - For compute capability < 7.0: support based on CUPTI Events API - For compute capability >= 7.0: support based on CUpti Profiling API +\subsection Architectures_AMD AMD GPU architectures +- ROCm 5.0 and higher capable GPUs + \section Examples Example Codes Using the Likwid API: - \ref C-likwidAPI-code diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1 index c5343322f..3312d6d74 100644 --- a/doc/likwid-perfctr.1 +++ b/doc/likwid-perfctr.1 @@ -1,6 +1,6 @@ .TH LIKWID-PERFCTR 1 likwid\- .SH NAME -likwid-perfctr \- configure and read out hardware performance counters on x86, ARM and POWER CPUs and Nvidia GPUs +likwid-perfctr \- configure and read out hardware performance counters on x86, ARM and POWER CPUs and Nvidia/AMD GPUs .SH SYNOPSIS .B likwid-perfctr .RB [\-vhHmaiefO] @@ -34,6 +34,12 @@ or .IR gpu_performance_group or .IR gpu_performance_event_string (*) ] +.RB [ \-I +.IR gpu_list (**) ] +.RB [ \-R +.IR gpu_performance_group +or +.IR gpu_performance_event_string (**) ] .RB [ \-\-stats ] .SH DESCRIPTION .B likwid-perfctr @@ -44,6 +50,7 @@ There are preconfigured performance groups with useful event sets and derived me events can be measured with custom event sets. The marker API can measure mulitple named regions and the results are accumulated over multiple region calls. .IR (*) Option only available if built with Nvidia GPU support +.IR (**) Option only available if built with AMD GPU support .SH OPTIONS .TP @@ -66,7 +73,7 @@ run in marker API mode print available performance groups for current processor, then exit. .TP .B \-\^e -print available counters and performance events of current processor and (if available) Nvidia GPUs. +print available counters and performance events of current processor and (if available) Nvidia or AMD GPUs. .TP .B \-\^o, \-\-\^output store all ouput to a file instead of stdout. 
For the filename the following placeholders are supported: @@ -116,7 +123,7 @@ Force writing of registers even if they are in use. Print only events and corresponding counters matching .TP .B \-\^G, \-\-\^gpus -specify a numerical list of GPU IDs. The list may contain multiple +specify a numerical list of Nvidia GPU IDs. The list may contain multiple items, separated by comma, and ranges. For example 0,3,9-11. .TP .B \-\^W, \-\-\^gpugroup or @@ -125,6 +132,16 @@ This can be one of the tags output with the -a flag in the GPU section. Also a custom event set can be specified by a comma separated list of events. Each event has the format eventId:GPUx (x=0,1,2,...). You can add as many events to the string until you hit an error. .TP +.B \-\^I, \-\-\^gpus +specify a numerical list of AMD GPU IDs. The list may contain multiple +items, separated by comma, and ranges. For example 0,3,9-11. +.TP +.B \-\^R, \-\-\^gpugroup or +specify which performance group to measure on the specified AMD GPUs. +This can be one of the tags output with the -a flag in the GPU section. +Also a custom event set can be specified by a comma separated list of events. Each event has the format +eventId:GPUx (x=0,1,2,...). You can add as many events to the string until you hit an error. +.TP .B \-\-\^stats Always print statistics table diff --git a/doc/likwid-topology.1 b/doc/likwid-topology.1 index 8ae22b6e2..b804a9bb2 100644 --- a/doc/likwid-topology.1 +++ b/doc/likwid-topology.1 @@ -1,6 +1,6 @@ .TH LIKWID-TOPOLOGY 1 likwid\- .SH NAME -likwid-topology \- print thread, cache, NUMA and Nvidia GPU topology +likwid-topology \- print thread, cache, NUMA and Nvidia/AMD GPU topology .SH SYNOPSIS .B likwid-topology .RB [\-hvgcCG] @@ -11,12 +11,12 @@ likwid-topology \- print thread, cache, NUMA and Nvidia GPU topology .SH DESCRIPTION .B likwid-topology is a command line application to print the thread and cache -topology on multicore x86, ARM and POWER processors and Nvidia GPUs. 
+topology on multicore x86, ARM and POWER processors and Nvidia/AMD GPUs. Used with mono spaced fonts it can draw the processor topology of a machine in ASCII art. Beyond topology likwid-topology determines the clock of a processor and prints detailed informations about the caches hierarchy. When compiled with NVIDIA_INTERFACE=true in config.mk and the CUDA/CUPTI library reachable -at runtime, likwid-topology prints information about the Nvidia GPUs in the system. +at runtime, likwid-topology prints information about the Nvidia GPUs in the system. The same is possible for AMD GPUs with ROCM_INTERFACE=TRUE and the required ROCm libraries. .SH OPTIONS .TP .B \-h, \-\-\^help @@ -38,7 +38,7 @@ prints detailed information about cache hierarchy measures and output the processor clock. This involves a longer run time of likwid-topology. .TP .B \-G, \-\-\^gpus -prints detailed information about the Nvidia GPUs in the system (if compiled with Nvidia support) +prints detailed information about the Nvidia/AMD GPUs in the system (if compiled with Nvidia or AMD support) .TP .B \-o, \-\-\^output write the output to file instead of stdout. diff --git a/groups/amd_gpu/GDS.txt b/groups/amd_gpu/GDS.txt new file mode 100644 index 000000000..f29639357 --- /dev/null +++ b/groups/amd_gpu/GDS.txt @@ -0,0 +1,13 @@ +SHORT GDS Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_GDS +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU GDS rw insts per work-item ROCM0/ROCM1 + +LONG +-- +The average number of GDS read or GDS write instructions executed +per work item (affected by flow control). diff --git a/groups/amd_gpu/MEM.txt b/groups/amd_gpu/MEM.txt new file mode 100644 index 000000000..d5e6c5350 --- /dev/null +++ b/groups/amd_gpu/MEM.txt @@ -0,0 +1,16 @@ +SHORT Memory utilization + +EVENTSET +ROCM0 ROCP_TA_TA_BUSY +ROCM1 ROCP_GRBM_GUI_ACTIVE +ROCM2 ROCP_SE_NUM + +METRICS +GPU memory utilization 100*max(ROCM0,16)/ROCM1/ROCM2 + +LONG +-- +The percentage of GPUTime the memory unit is active. 
The result includes +the stall time (MemUnitStalled). This is measured with all extra fetches +and writes and any cache or memory effects taken into account. +Value range: 0% to 100% (fetch-bound). diff --git a/groups/amd_gpu/PCI.txt b/groups/amd_gpu/PCI.txt new file mode 100644 index 000000000..201f4ff89 --- /dev/null +++ b/groups/amd_gpu/PCI.txt @@ -0,0 +1,18 @@ +SHORT PCI Transfers + +EVENTSET +ROCM0 RSMI_PCI_THROUGHPUT_SENT +ROCM1 RSMI_PCI_THROUGHPUT_RECEIVED + + +METRICS +Runtime time +PCI sent ROCM0 +PCI received ROCM1 +PCI send bandwidth 1E-6*ROCM0/time +PCI recv bandwidth 1E-6*ROCM1/time + +LONG +-- +Currently not usable since the RSMI_PCI_THROUGHPUT_* events require +one second per call, so 2 seconds for both of them. diff --git a/groups/amd_gpu/POWER.txt b/groups/amd_gpu/POWER.txt new file mode 100644 index 000000000..e4ee0a7bb --- /dev/null +++ b/groups/amd_gpu/POWER.txt @@ -0,0 +1,17 @@ +SHORT Power, temperature and voltage + +EVENTSET +ROCM0 RSMI_POWER_AVE[0] +ROCM1 RSMI_TEMP_EDGE +ROCM2 RSMI_VOLT_VDDGFX + + +METRICS +Power average 1E-6*ROCM0 +Edge temperature 1E-3*ROCM1 +Voltage 1E-3*ROCM2 + +LONG +-- +Gets the current average power consumption in watts, the +temperature in celsius and the voltage in volts. diff --git a/groups/amd_gpu/SALU.txt b/groups/amd_gpu/SALU.txt new file mode 100644 index 000000000..b5259d793 --- /dev/null +++ b/groups/amd_gpu/SALU.txt @@ -0,0 +1,13 @@ +SHORT SALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SALU insts per work-item ROCM0/ROCM1 + +LONG +-- +The average number of scalar ALU instructions executed per work-item +(affected by flow control). 
diff --git a/groups/amd_gpu/SFETCH.txt b/groups/amd_gpu/SFETCH.txt new file mode 100644 index 000000000..e33930eba --- /dev/null +++ b/groups/amd_gpu/SFETCH.txt @@ -0,0 +1,13 @@ +SHORT SFetch Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_SMEM +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU SFETCH insts per work-item ROCM0/ROCM1 + +LONG +-- +The average number of scalar fetch instructions from the video memory +executed per work-item (affected by flow control). diff --git a/groups/amd_gpu/STALLED.txt b/groups/amd_gpu/STALLED.txt new file mode 100644 index 000000000..bc6086022 --- /dev/null +++ b/groups/amd_gpu/STALLED.txt @@ -0,0 +1,17 @@ +SHORT ALU stalled by LDS + +EVENTSET +ROCM0 ROCP_SQ_WAIT_INST_LDS +ROCM1 ROCP_SQ_WAVES +ROCM2 ROCP_GRBM_GUI_ACTIVE + +METRICS +GPU ALD stalled 100*ROCM0*4/ROCM1/ROCM2 + +LONG +-- +The percentage of GPUTime ALU units are stalled by the LDS input queue +being full or the output queue being not ready. If there are LDS bank +conflicts, reduce them. Otherwise, try reducing the number of LDS +accesses if possible. +Value range: 0% (optimal) to 100% (bad). diff --git a/groups/amd_gpu/UTIL.txt b/groups/amd_gpu/UTIL.txt new file mode 100644 index 000000000..e831e3c16 --- /dev/null +++ b/groups/amd_gpu/UTIL.txt @@ -0,0 +1,16 @@ +SHORT GPU utilization + +EVENTSET +ROCM0 ROCP_GRBM_COUNT +ROCM1 ROCP_GRBM_GUI_ACTIVE + + +METRICS +GPU utilization 100*ROCM1/ROCM0 + + +LONG +-- +This group reassembles the 'GPUBusy' metric provided by RocProfiler. +We should add, that we can select the GPUBusy metric directly and the +calculations are done internally in case the metric formula changes. 
diff --git a/groups/amd_gpu/VALU.txt b/groups/amd_gpu/VALU.txt new file mode 100644 index 000000000..e26a3b690 --- /dev/null +++ b/groups/amd_gpu/VALU.txt @@ -0,0 +1,13 @@ +SHORT VALU Instructions + +EVENTSET +ROCM0 ROCP_SQ_INSTS_VALU +ROCM1 ROCP_SQ_WAVES + +METRICS +GPU VALU insts per work-item ROCM0/ROCM1 + +LONG +-- +The average number of vector ALU instructions executed per work-item +(affected by flow control). diff --git a/groups/amd_gpu/WAVE.txt b/groups/amd_gpu/WAVE.txt new file mode 100644 index 000000000..eb9aec9fe --- /dev/null +++ b/groups/amd_gpu/WAVE.txt @@ -0,0 +1,13 @@ +SHORT Wavefronts + +EVENTSET +ROCM0 ROCP_SQ_WAVES + + +METRICS +GPU wavefronts ROCM0 + + +LONG +-- +Total Wavefronts diff --git a/make/config_checks.mk b/make/config_checks.mk index 4d23b3607..214a83e5c 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -82,3 +82,8 @@ ifeq ($(strip $(NVIDIA_INTERFACE)), true) INCLUDES += -I$(CUDAINCLUDE) -I$(CUPTIINCLUDE) #CPPFLAGS += -L$(CUDALIBDIR) -L$(CUPTILIBDIR) endif + +ifeq ($(strip $(ROCM_INTERFACE)), true) +# HSA includes 'hsa/xxx.h' and rocprofiler 'xxx.h' +INCLUDES += -I$(HIPINCLUDE) -I$(HSAINCLUDE) -I$(HSAINCLUDE)/hsa -I$(ROCPROFILERINCLUDE) -I$(RSMIINCLUDE) +endif diff --git a/make/config_defines.mk b/make/config_defines.mk index 990185e1f..92c4b9e3b 100644 --- a/make/config_defines.mk +++ b/make/config_defines.mk @@ -294,8 +294,10 @@ endif ifeq ($(strip $(NVIDIA_INTERFACE)),true) DEFINES += -DLIKWID_WITH_NVMON -else -BUILDAPPDAEMON := false +endif + +ifeq ($(strip $(ROCM_INTERFACE)),true) +DEFINES += -DLIKWID_WITH_ROCMON -D__HIP_PLATFORM_HCC__ endif ifeq ($(strip $(BUILDDAEMON)),true) diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile index 8e272d09f..ecd500c1a 100644 --- a/src/access-daemon/Makefile +++ b/src/access-daemon/Makefile @@ -39,12 +39,18 @@ DEFINES += -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -DMAX_NUM_NODES= ifeq ($(DEBUG),true) DEFINES += -DDEBUG_LIKWID endif +ifeq 
($(NVIDIA_INTERFACE), true) +DEFINES += -DLIKWID_NVMON +endif +ifeq ($(ROCM_INTERFACE), true) +DEFINES += -DLIKWID_ROCMON +endif INCLUDES = -I../includes CFLAGS += -std=c99 -fPIC -pie -fPIE -fstack-protector ifeq ($(COMPILER),GCCX86) CFLAGS += -m32 endif -CPPFLAGS := $(DEFINES) $(INCLUDES) +CPPFLAGS := $(DEFINES) $(INCLUDES) -L$(PREFIX)/lib ifeq ($(COMPILER),GCCARMv8) all: @@ -59,4 +65,4 @@ $(SETFREQ_TARGET): setFreqDaemon.c $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreqDaemon.c $(APPDAEMON_TARGET): $(GOTCHA_TARGET) appDaemon.c - $(Q)$(CC) -shared -fPIC $(CPPFLAGS) -Wl,-soname,$(APPDAEMON_TARGET).$(VERSION).$(RELEASE) -fstack-protector -I. -I$(GOTCHA_FOLDER)/include -L$(GOTCHA_FOLDER) appDaemon.c -o ../../$(APPDAEMON_TARGET) -llikwid-gotcha + $(Q)$(CC) -shared -fPIC $(CPPFLAGS) -Wl,-soname,$(APPDAEMON_TARGET).$(VERSION).$(RELEASE) -fstack-protector -I. ../bstrlib.c appDaemon.c -o ../../$(APPDAEMON_TARGET) -llikwid -L../../ diff --git a/src/access-daemon/appDaemon.c b/src/access-daemon/appDaemon.c index cbf33cc43..1f82f6757 100644 --- a/src/access-daemon/appDaemon.c +++ b/src/access-daemon/appDaemon.c @@ -32,48 +32,581 @@ #include #include -#include +#include +#include +#include +#include +#include +#include -gotcha_wrappee_handle_t orig_main_handle; +#include +#include -static int appDaemon_initialized = 0; +typedef void(*appdaemon_exit_func)(void); +#define APPDAEMON_MAX_EXIT_FUNCS 2 +static appdaemon_exit_func appdaemon_exit_funcs[APPDAEMON_MAX_EXIT_FUNCS]; +static int appdaemon_num_exit_funcs = 0; -int likwid_appDaemon_main(int argc, char** argv) +static struct tagbstring daemon_name = bsStatic("likwid-appDaemon.so"); +static FILE* output_file = NULL; + +// Timeline mode +static int stopIssued = 0; +static pthread_mutex_t stopMutex; + +int appdaemon_register_exit(appdaemon_exit_func f) { - int return_code = 0; - typeof(&likwid_appDaemon_main) orig_main = (int (*)(int, char**))gotcha_get_wrappee(orig_main_handle); - char* nvEventStr = 
getenv("NVMON_EVENTS"); - char* nvGpuStr = getenv("NVMON_GPUS"); + if (appdaemon_num_exit_funcs < APPDAEMON_MAX_EXIT_FUNCS) + { + appdaemon_exit_funcs[appdaemon_num_exit_funcs] = f; + appdaemon_num_exit_funcs++; + } +} - if (appDaemon_initialized) +static void after_main() +{ + // Stop timeline thread (if running) + pthread_mutex_lock(&stopMutex); + stopIssued = 1; + pthread_mutex_unlock(&stopMutex); + + for (int i = 0; i < appdaemon_num_exit_funcs; i++) { - return_code = orig_main(argc, argv); + appdaemon_exit_funcs[i](); } - else + + if (output_file) { + fclose(output_file); + } +} - appDaemon_initialized = 1; +static void prepare_ldpreload() +{ + int (*mysetenv)(const char *name, const char *value, int overwrite) = setenv; + char* ldpreload = getenv("LD_PRELOAD"); + if (ldpreload) + { + printf("Old LD_PRELOAD=%s\n", ldpreload); + bstring bldpre = bfromcstr(ldpreload); + bstring new_bldpre = bfromcstr(""); + struct bstrList *liblist = bsplit(bldpre, ':'); + for (int i = 0; i < liblist->qty; i++) + { + if (binstr(liblist->entry[i], 0, &daemon_name) == BSTR_ERR) + { + bconcat(new_bldpre, liblist->entry[i]); + bconchar(new_bldpre, ':'); + } + } + printf("New LD_PRELOAD=%s\n", bdata(new_bldpre)); + mysetenv("LD_PRELOAD", bdata(new_bldpre), 1); + bstrListDestroy(liblist); + bdestroy(new_bldpre); + bdestroy(bldpre); + } +} +static int parse_gpustr(char* gpuStr, int* numGpus, int** gpuIds) +{ + // Create bstring + bstring bGpuStr = bfromcstr(gpuStr); + + // Parse list + struct bstrList* gpuTokens = bsplit(bGpuStr,','); + int tmpNumGpus = gpuTokens->qty; - return_code = orig_main(argc, argv); + // Allocate gpuId list + int* tmpGpuIds = malloc(tmpNumGpus * sizeof(int)); + if (!tmpGpuIds) + { + fprintf(stderr,"Cannot allocate space for GPU list.\n"); + bdestroy(bGpuStr); + bstrListDestroy(gpuTokens); + return -EXIT_FAILURE; } + // Parse ids to int + for (int i = 0; i < tmpNumGpus; i++) + { + tmpGpuIds[i] = atoi(bdata(gpuTokens->entry[i])); + } + // Copy data + *numGpus = 
tmpNumGpus; + *gpuIds = tmpGpuIds; + // Destroy bstring + bdestroy(bGpuStr); + bstrListDestroy(gpuTokens); + return 0; +} +/* +Nvmon +*/ +#ifdef LIKWID_NVMON +static int nvmon_initialized = 0; +static int* nvmon_gpulist = NULL; +static int nvmon_numgpus = 0; +static int* nvmon_gids = NULL; +static int nvmon_numgids = 0; - appDaemon_initialized = 0; - return return_code; +static int appdaemon_setup_nvmon(char* gpuStr, char* eventStr) +{ + int ret = 0; + printf("Nvmon GPU string: %s\n", gpuStr); + printf("Nvmon Event string: %s\n", eventStr); + + // Parse gpu string + ret = parse_gpustr(gpuStr, &nvmon_numgpus, &nvmon_gpulist); + if (ret < 0) + { + ERROR_PRINT(Failed to get nvmon gpulist from '%s', gpuStr); + goto appdaemon_setup_nvmon_cleanup; + } + + // Parse event string + bstring bev = bfromcstr(eventStr); + struct bstrList* nvmon_eventlist = bsplit(bev, '|'); + bdestroy(bev); + nvmon_gids = malloc(nvmon_eventlist->qty * sizeof(int)); + if (!nvmon_gids) + { + ERROR_PRINT(Failed to allocate space for nvmon group IDs); + goto appdaemon_setup_nvmon_cleanup; + } + + // Init nvmon + ret = nvmon_init(nvmon_numgpus, nvmon_gpulist); + if (ret < 0) + { + ERROR_PRINT(Failed to initialize nvmon); + goto appdaemon_setup_nvmon_cleanup; + } + nvmon_initialized = 1; + + // Add event sets + for (int i = 0; i < nvmon_eventlist->qty; i++) + { + ret = nvmon_addEventSet(bdata(nvmon_eventlist->entry[i])); + if (ret < 0) + { + ERROR_PRINT(Failed to add nvmon group: %s, bdata(nvmon_eventlist->entry[i])); + continue; + } + nvmon_gids[nvmon_numgids++] = ret; + } + if (nvmon_numgids == 0) + { + ERROR_PRINT(Failed to add any events to nvmon); + goto appdaemon_setup_nvmon_cleanup; + } + + // Setup counters + ret = nvmon_setupCounters(nvmon_gids[0]); + if (ret < 0) + { + ERROR_PRINT(Failed to setup nvmon); + goto appdaemon_setup_nvmon_cleanup; + } + + // Start counters + ret = nvmon_startCounters(); + if (ret < 0) + { + ERROR_PRINT(Failed to start nvmon); + goto appdaemon_setup_nvmon_cleanup; 
+ } + return 0; +appdaemon_setup_nvmon_cleanup: + if (nvmon_initialized) + { + nvmon_finalize(); + nvmon_initialized = 0; + } + if (nvmon_gids) + { + free(nvmon_gids); + nvmon_gids = NULL; + nvmon_numgids = 0; + } + if (nvmon_eventlist) + { + bstrListDestroy(nvmon_eventlist); + nvmon_eventlist = NULL; + } + if (nvmon_gpulist) + { + free(nvmon_gpulist); + nvmon_gpulist = NULL; + nvmon_numgpus = 0; + } + return ret; } +static void appdaemon_close_nvmon(void) +{ + // Stop counters + int ret = nvmon_stopCounters(); + if (ret < 0) + { + ERROR_PRINT(Failed to stop nvmon); + } -struct gotcha_binding_t likwid_appDaemon_overwrites[] = { - {"main", likwid_appDaemon_main, (void*)&orig_main_handle}, -}; + // Print results + for (int g = 0; g < nvmon_numgids; g++) + { + int gid = nvmon_gids[g]; + for (int i = 0; i < nvmon_getNumberOfEvents(gid); i++) + { + for (int j = 0; j < nvmon_numgpus; j++) + { + fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", nvmon_gpulist[j], nvmon_getTimeOfGroup(nvmon_gpulist[j]), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j)); + } + } + } + fflush(output_file); + // Cleanup + if (nvmon_initialized) + { + nvmon_finalize(); + nvmon_initialized = 0; + } + if (nvmon_gids) + { + free(nvmon_gids); + nvmon_gids = NULL; + nvmon_numgids = 0; + } + if (nvmon_gpulist) + { + free(nvmon_gpulist); + nvmon_gpulist = NULL; + nvmon_numgpus = 0; + } +} -void __attribute__((constructor)) likwid_appDaemon_constructor() +static void appdaemon_read_nvmon(void) { - gotcha_wrap(likwid_appDaemon_overwrites, 1 ,"likwid_appDaemon"); + // Read counters + int ret = nvmon_readCounters(); + if (ret < 0) + { + fprintf(stderr, "Failed to read Nvmon counters\n"); + return; + } + + // Print results + for (int g = 0; g < nvmon_numgids; g++) + { + int gid = nvmon_gids[g]; + for (int i = 0; i < nvmon_getNumberOfEvents(gid); i++) + { + for (int j = 0; j < nvmon_numgpus; j++) + { + fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", 
nvmon_gpulist[j], nvmon_getTimeToLastReadOfGroup(nvmon_gpulist[j]), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j)); + } + } + } } +#endif + +/* +Rocmon +*/ +#ifdef LIKWID_ROCMON +static int rocmon_initialized = 0; +static int* rocmon_gpulist = NULL; +static int rocmon_numgpus = 0; +static int* rocmon_gids = NULL; +static int rocmon_numgids = 0; + +static int appdaemon_setup_rocmon(char* gpuStr, char* eventStr) +{ + int ret = 0; + printf("Rocmon GPU string: %s\n", gpuStr); + printf("Rocmon Event string: %s\n", eventStr); + + // Parse gpu string + ret = parse_gpustr(gpuStr, &rocmon_numgpus, &rocmon_gpulist); + if (ret < 0) + { + ERROR_PRINT(Failed to get rocmon gpulist from '%s', gpuStr); + goto appdaemon_setup_rocmon_cleanup; + } + + // Parse event string + bstring bev = bfromcstr(eventStr); + struct bstrList* rocmon_eventlist = bsplit(bev, '|'); // TODO: multiple event sets not supported + bdestroy(bev); + rocmon_gids = malloc(rocmon_eventlist->qty * sizeof(int)); + if (!rocmon_gids) + { + ERROR_PRINT(Failed to allocate space for rocmon group IDs); + goto appdaemon_setup_rocmon_cleanup; + } + + // Init rocmon + ret = rocmon_init(rocmon_numgpus, rocmon_gpulist); + if (ret < 0) + { + ERROR_PRINT(Failed to initialize rocmon); + goto appdaemon_setup_rocmon_cleanup; + } + rocmon_initialized = 1; + + // Add event sets + for (int i = 0; i < rocmon_eventlist->qty; i++) + { + ret = rocmon_addEventSet(bdata(rocmon_eventlist->entry[i]), &rocmon_gids[rocmon_numgids++]); + if (ret < 0) + { + ERROR_PRINT(Failed to add rocmon group: %s, bdata(rocmon_eventlist->entry[i])); + } + } + if (rocmon_numgids == 0) + { + ERROR_PRINT(Failed to add any events to rocmon); + goto appdaemon_setup_rocmon_cleanup; + } + + // Setup counters + ret = rocmon_setupCounters(rocmon_gids[0]); + if (ret < 0) + { + ERROR_PRINT(Failed to setup rocmon); + goto appdaemon_setup_rocmon_cleanup; + } + + // Start counters + ret = rocmon_startCounters(); + if (ret < 0) + { 
+ ERROR_PRINT(Failed to start rocmon); + goto appdaemon_setup_rocmon_cleanup; + } + return 0; +appdaemon_setup_rocmon_cleanup: + if (rocmon_initialized) + { + rocmon_finalize(); + rocmon_initialized = 0; + } + if (rocmon_gids) + { + free(rocmon_gids); + rocmon_gids = NULL; + rocmon_numgids = 0; + } + if (rocmon_eventlist) + { + bstrListDestroy(rocmon_eventlist); + rocmon_eventlist = NULL; + } + if (rocmon_gpulist) + { + free(rocmon_gpulist); + rocmon_gpulist = NULL; + rocmon_numgpus = 0; + } + return ret; +} + +static void appdaemon_close_rocmon(void) +{ + // Stop counters + int ret = rocmon_stopCounters(); + if (ret < 0) + { + ERROR_PRINT(Failed to stop rocmon); + } + + // Print results + for (int g = 0; g < rocmon_numgids; g++) + { + int gid = rocmon_gids[g]; + for (int i = 0; i < rocmon_getNumberOfEvents(gid); i++) + { + for (int j = 0; j < rocmon_numgpus; j++) + { + fprintf(output_file, "Rocmon, %d, %f, %s, %f, %f\n", rocmon_gpulist[j], rocmon_getTimeOfGroup(rocmon_gpulist[j]), rocmon_getEventName(gid, i), rocmon_getResult(j, gid, i), rocmon_getLastResult(j, gid, i)); + } + } + } + + // Cleanup + if (rocmon_initialized) + { + rocmon_finalize(); + rocmon_initialized = 0; + } + if (rocmon_gids) + { + free(rocmon_gids); + rocmon_gids = NULL; + rocmon_numgids = 0; + } + if (rocmon_gpulist) + { + free(rocmon_gpulist); + rocmon_gpulist = NULL; + rocmon_numgpus = 0; + } +} + +static void appdaemon_read_rocmon(void) +{ + // Read counters + int ret = rocmon_readCounters(); + if (ret < 0) + { + fprintf(stderr, "Failed to read Rocmon counters\n"); + return; + } + + // Print results + for (int g = 0; g < rocmon_numgids; g++) + { + int gid = rocmon_gids[g]; + for (int i = 0; i < rocmon_getNumberOfEvents(gid); i++) + { + for (int j = 0; j < rocmon_numgpus; j++) + { + fprintf(output_file, "Rocmon, %d, %f, %s, %f, %f\n", rocmon_gpulist[j], rocmon_getTimeToLastReadOfGroup(rocmon_gpulist[j]), rocmon_getEventName(gid, i), rocmon_getResult(j, gid, i), rocmon_getLastResult(j, gid, 
i));
+            }
+        }
+    }
+}
+#endif
+
+
+/*
+Timeline mode
+*/
+// Thread body for timeline (interval-sampling) mode.  Sleeps for the
+// requested interval, then triggers a counter read on every enabled GPU
+// backend until the stopIssued flag (guarded by stopMutex) is raised.
+// arg points at the interval in milliseconds (owned by __libc_start_main's
+// stack frame, which outlives this thread).
+static void* appdaemon_timeline_main(void* arg)
+{
+    int stop = 0;
+    int target_delay_ms = *((int*)arg);
+    ;  // NOTE(review): stray empty statement -- harmless, should be removed
+
+    while (1)
+    {
+        usleep(target_delay_ms * 1E3);  // ms -> us
+
+        // Check stop status
+        pthread_mutex_lock(&stopMutex);
+        stop = stopIssued;
+        pthread_mutex_unlock(&stopMutex);
+        if (stop > 0) break;
+
+#ifdef LIKWID_NVMON
+        appdaemon_read_nvmon();
+#endif
+#ifdef LIKWID_ROCMON
+        appdaemon_read_rocmon();
+#endif
+    }
+    // NOTE(review): missing return statement -- a pthread start routine
+    // returning void* should end with "return NULL;".
+}
+
+
+/*
+Main
+*/
+// LD_PRELOAD interposer around glibc's program entry point.  It resolves the
+// real __libc_start_main via dlsym(RTLD_NEXT, ...), configures GPU
+// monitoring from environment variables (LIKWID_INTERVAL, LIKWID_OUTPUTFILE,
+// LIKWID_{NVMON,ROCMON}_{EVENTS,GPUS}), optionally spawns the timeline
+// sampling thread, and finally delegates to the real startup routine so the
+// target application runs unmodified.
+int __libc_start_main(int (*main) (int,char **,char **),
+              int argc,char **ubp_av,
+              void (*init) (void),
+              void (*fini)(void),
+              void (*rtld_fini)(void),
+              void (*stack_end)) {
+    int ret = 0;
+    int (*original__libc_start_main)(int (*main) (int,char **,char **),
+              int argc,char **ubp_av,
+              void (*init) (void),
+              void (*fini)(void),
+              void (*rtld_fini)(void),
+              void (*stack_end));
+
+    // NOTE(review): lock-then-immediately-unlock of all current pages looks
+    // like a deliberate page-touch side effect -- confirm the intent.
+    mlockall(MCL_CURRENT);
+    munlockall();
+    atexit(after_main);  // run registered backend exit handlers after main()
+
+
+    original__libc_start_main = dlsym(RTLD_NEXT, "__libc_start_main");
+
+    prepare_ldpreload();
+
+    // Get timeline mode info
+    // atoi() yields 0 for non-numeric input, rejected below as invalid; an
+    // unset variable keeps -1, which disables timeline mode further down.
+    char* timelineStr = getenv("LIKWID_INTERVAL");
+    int timelineInterval = -1; // in ms
+    if (timelineStr != NULL)
+    {
+        timelineInterval = atoi(timelineStr);
+    }
+    if (timelineInterval == 0)
+    {
+        fprintf(stderr, "Invalid timeline interval\n");
+        return -1;
+    }
+
+    // Open output file (falls back to stderr when no file is requested)
+    char* outputFilename = getenv("LIKWID_OUTPUTFILE");
+    if (outputFilename == NULL)
+    {
+        output_file = stderr;
+    } else {
+        output_file = fopen(outputFilename,"w");
+    }
+
+    if (output_file == NULL)
+    {
+        fprintf(stderr, "Cannot open file %s\n", outputFilename);
+        fprintf(stderr, "%s", strerror(errno));
+        return -1;
+    }
+    // CSV header; the appdaemon_read_*/appdaemon_close_* printers emit rows
+    // in this column order.
+    fprintf(output_file, "Backend, GPU, Time, Event, Full Value, Last Value\n");
+
+#ifdef LIKWID_NVMON
+    // Enable the CUDA backend only when both event set and GPU list are set.
+    char* nvEventStr = getenv("LIKWID_NVMON_EVENTS");
+    char* nvGpuStr = getenv("LIKWID_NVMON_GPUS");
+    if (nvEventStr && nvGpuStr)
+    {
+        ret = appdaemon_setup_nvmon(nvGpuStr, nvEventStr);
+        if (!ret)
+        {
+            appdaemon_register_exit(appdaemon_close_nvmon);
+        }
+    }
+#endif
+
+#ifdef LIKWID_ROCMON
+    // Mirror of the NVMON block above for the ROCm backend: enable it only
+    // when both the event set and the GPU list are given.
+    char* rocmonEventStr = getenv("LIKWID_ROCMON_EVENTS");
+    char* rocmonGpuStr = getenv("LIKWID_ROCMON_GPUS");
+    if (rocmonEventStr && rocmonGpuStr)
+    {
+        ret = appdaemon_setup_rocmon(rocmonGpuStr, rocmonEventStr);
+        if (!ret)
+        {
+            appdaemon_register_exit(appdaemon_close_rocmon);
+        }
+    }
+#endif
+
+    // Start timeline thread
+    if (timelineInterval >= 0)
+    {
+        pthread_t tid;
+        // &timelineInterval stays valid: this stack frame lives until the
+        // program exits through the real __libc_start_main below.
+        ret = pthread_create(&tid, NULL, &appdaemon_timeline_main, &timelineInterval);
+        // NOTE(review): pthread_create() returns 0 on success or a positive
+        // errno value on failure, never a negative value -- this check can
+        // never fire and should test "ret != 0".
+        if (ret < 0)
+        {
+            fprintf(stderr, "Failed to create timeline thread\n");
+            return -1;
+        }
+    }
+
+    return original__libc_start_main(main,argc,ubp_av,
+            init,fini,rtld_fini,stack_end);
+}
+
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
index 7759f0f45..b58536143 100644
--- a/src/applications/likwid-perfctr.lua
+++ b/src/applications/likwid-perfctr.lua
@@ -35,10 +35,14 @@ package.path = '/share/lua/?.lua;' .. package.path
 local likwid = require("likwid")
 
 print_stdout = print
-print_stderr = function(...) for k,v in pairs({...}) do io.stderr:write(v .. "\n") end io.stderr:flush() end
+print_stderr = function(...)
+    for k, v in pairs({ ... }) do io.stderr:write(v ..
"\n") end + io.stderr:flush() +end local function version() - print_stdout(string.format("likwid-perfctr -- Version %d.%d.%d (commit: %s)",likwid.version,likwid.release,likwid.minor,likwid.commit)) + print_stdout(string.format("likwid-perfctr -- Version %d.%d.%d (commit: %s)", likwid.version, likwid.release, + likwid.minor, likwid.commit)) end local function examples() @@ -57,6 +61,12 @@ local function examples() io.stdout:write("It is possible to combine CPU and GPU measurements (with MarkerAPI and NVMarkerAPI):\n") io.stdout:write("likwid-perfctr -C 2 -g CLOCK -G 1 -W FLOPS_DP -m ./a.out\n") end + if likwid.rocmSupported() then + io.stdout:write("Run command and measure on GPU 1 the performance group PCI (Only with ROCmMarkerAPI):\n") + io.stdout:write("likwid-perfctr -I 1 -R PCI -m ./a.out\n") + io.stdout:write("It is possible to combine CPU and GPU measurements (with MarkerAPI and ROCmMarkerAPI):\n") + io.stdout:write("likwid-perfctr -C 2 -g CLOCK -I 1 -R PCI -m ./a.out\n") + end end local function usage(config) @@ -70,12 +80,18 @@ local function usage(config) io.stdout:write("-C \t\t Processor ids to pin threads and measure, e.g. 
1,2-4,8\n") io.stdout:write("\t\t\t For information about the syntax, see likwid-pin\n") if likwid.nvSupported() then - io.stdout:write("-G, --gpus \t List of GPUs to monitor\n") + io.stdout:write("-G, --gpus \t List of CUDA GPUs to monitor\n") + end + if likwid.rocmSupported() then + io.stdout:write("-I \t\t List of ROCm GPUs to monitor\n") end io.stdout:write("-g, --group \t Performance group or custom event set string for CPU monitoring\n") if likwid.nvSupported() then io.stdout:write("-W, --gpugroup \t Performance group or custom event set string for GPU monitoring\n") end + if likwid.rocmSupported() then + io.stdout:write("-R \t\t Performance group or custom event set string for ROCm GPU monitoring\n") + end io.stdout:write("-H\t\t\t Get group help (together with -g switch)\n") io.stdout:write("-s, --skip \t Bitmask with threads to skip\n") io.stdout:write("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon\n") @@ -89,12 +105,15 @@ local function usage(config) io.stdout:write("-S