Skip to content

Commit

Permalink
Add write result file function for Nvidia GPU and AMD GPU. (RRZE-HPC#586
Browse files Browse the repository at this point in the history
)

* Add write result file function for Nvidia GPU and AMD GPU. See RRZE-HPC#584

* Fix bad naming, make local with GPUs and enable appDaemon build if GPU build
  • Loading branch information
TomTheBear authored Dec 19, 2023
1 parent b2a2113 commit c90c67d
Show file tree
Hide file tree
Showing 13 changed files with 225 additions and 108 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ local: $(L_APPS) likwid.lua
@echo "===> Setting Lua scripts to run from current directory"
@PWD=$(shell pwd)
@for APP in $(L_APPS); do \
sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s/<GITCOMMIT>/$(GITCOMMIT)/g" -e "s/<MINOR>/$(MINOR)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" -e "s+$(PREFIX)/bin/likwid-pin+$(PWD)/likwid-pin+" -e "s+$(PREFIX)/bin/likwid-perfctr+$(PWD)/likwid-perfctr+" $$APP; \
sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s/<GITCOMMIT>/$(GITCOMMIT)/g" -e "s/<MINOR>/$(MINOR)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(PWD)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(PWD)/?.lua+" -e "s+$(PREFIX)/bin/likwid-pin+$(PWD)/likwid-pin+" -e "s+$(PREFIX)/bin/likwid-perfctr+$(PWD)/likwid-perfctr+" -e "s+$(PREFIX)/lib+$(PWD)+" $$APP; \
chmod +x $$APP; \
done
@sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/lib+$(PWD)+g" -e "s+$(PREFIX)/share/likwid/perfgroups+$(PWD)/groups+g" -e "s/<GITCOMMIT>/$(GITCOMMIT)/g" -e "s/<MINOR>/$(MINOR)/g" likwid.lua;
Expand Down
2 changes: 2 additions & 0 deletions make/config_defines.mk
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,12 @@ endif

ifeq ($(strip $(NVIDIA_INTERFACE)),true)
DEFINES += -DLIKWID_WITH_NVMON
BUILDAPPDAEMON = true
endif

ifeq ($(strip $(ROCM_INTERFACE)),true)
DEFINES += -DLIKWID_WITH_ROCMON -D__HIP_PLATFORM_HCC__
BUILDAPPDAEMON = true
endif

ifeq ($(strip $(BUILDDAEMON)),true)
Expand Down
2 changes: 1 addition & 1 deletion src/access-daemon/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ CFLAGS += -std=c99 -fPIC -pie -fPIE -fstack-protector
ifeq ($(COMPILER),GCCX86)
CFLAGS += -m32
endif
CPPFLAGS := $(DEFINES) $(INCLUDES) -L$(PREFIX)/lib
CPPFLAGS := $(DEFINES) $(INCLUDES) -L../../lib

ifeq ($(COMPILER),GCCARMv8)
all:
Expand Down
4 changes: 2 additions & 2 deletions src/access-daemon/appDaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ static int parse_gpustr(char* gpuStr, int* numGpus, int** gpuIds)
{
// Create bstring
bstring bGpuStr = bfromcstr(gpuStr);

int (*ownatoi)(const char*) = atoi;
// Parse list
struct bstrList* gpuTokens = bsplit(bGpuStr,',');
int tmpNumGpus = gpuTokens->qty;
Expand All @@ -129,7 +129,7 @@ static int parse_gpustr(char* gpuStr, int* numGpus, int** gpuIds)
// Parse ids to int
for (int i = 0; i < tmpNumGpus; i++)
{
tmpGpuIds[i] = atoi(bdata(gpuTokens->entry[i]));
tmpGpuIds[i] = ownatoi(bdata(gpuTokens->entry[i]));
}

// Copy data
Expand Down
10 changes: 5 additions & 5 deletions src/applications/likwid-perfctr.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1541,14 +1541,14 @@ if use_marker == true then
---------------------------
if nvSupported and #cuda_event_string_list > 0 then
if likwid.access(nvMarkerFile, "e") >= 0 then
results, metrics = likwid.getNvMarkerResults(nvMarkerFile, markergpulist, nan2value)
results, metrics = likwid.getMarkerResultsCuda(nvMarkerFile, gpulist_cuda, nan2value)
if not results then
print_stderr("Failure reading GPU Marker API result file.")
print_stderr("Failure reading Nv Marker API result file.")
elseif #results == 0 then
print_stderr("No regions could be found in GPU Marker API result file.")
print_stderr("No regions could be found in Nv Marker API result file.")
else
for r = 1, #results do
likwid.printGpuOutput(results[r], metrics[r], gpulist_cuda, r, print_stats)
likwid.printOutputCuda(results[r], metrics[r], gpulist_cuda, r, print_stats)
end
end
likwid.destroyNvMarkerFile()
Expand All @@ -1561,7 +1561,7 @@ if use_marker == true then
---------------------------
if rocmSupported and #rocm_event_string_list > 0 then
if likwid.access(rocmMarkerFile, "e") >= 0 then
results, metrics = likwid.getMarkerResultsRocm(rocmMarkerFile, markerrocmgpulist, nan2value)
results, metrics = likwid.getMarkerResultsRocm(rocmMarkerFile, gpulist_rocm, nan2value)
if not results then
print_stderr("Failure reading ROCM Marker API result file.")
elseif #results == 0 then
Expand Down
83 changes: 17 additions & 66 deletions src/applications/likwid.lua
Original file line number Diff line number Diff line change
Expand Up @@ -202,17 +202,17 @@ likwid.nvGetNameOfCounter = likwid_nvGetNameOfCounter
likwid.nvSupported = likwid_nvSupported
likwid.readNvMarkerFile = likwid_readNvMarkerFile
likwid.destroyNvMarkerFile = likwid_destroyNvMarkerFile
likwid.nvMarkerNumRegions = nvmon_getNumberOfRegions
likwid.nvMarkerRegionGroup = nvmon_getGroupOfRegion
likwid.nvMarkerRegionTag = nvmon_getTagOfRegion
likwid.nvMarkerRegionEvents = likwid_markerRegionEvents
likwid.nvMarkerRegionMetrics = likwid_markerRegionMetrics
likwid.nvMarkerRegionGpulist = likwid_markerRegionGpulist
likwid.nvMarkerRegionGpus = likwid_markerRegionGpus
likwid.nvMarkerRegionTime = likwid_markerRegionTime
likwid.nvMarkerRegionCount = likwid_markerRegionCount
likwid.nvMarkerRegionResult = likwid_markerRegionResult
likwid.nvMarkerRegionMetric = likwid_markerRegionMetric
likwid.nvMarkerNumRegions = likwid_nvMarkerNumRegions
likwid.nvMarkerRegionGroup = likwid_nvMarkerRegionGroup
likwid.nvMarkerRegionTag = likwid_nvMarkerRegionTag
likwid.nvMarkerRegionEvents = likwid_nvMarkerRegionEvents
likwid.nvMarkerRegionMetrics = likwid_nvMarkerRegionMetrics
likwid.nvMarkerRegionGpulist = likwid_nvMarkerRegionGpulist
likwid.nvMarkerRegionGpus = likwid_nvMarkerRegionGpus
likwid.nvMarkerRegionTime = likwid_nvMarkerRegionTime
likwid.nvMarkerRegionCount = likwid_nvMarkerRegionCount
likwid.nvMarkerRegionResult = likwid_nvMarkerRegionResult
likwid.nvMarkerRegionMetric = likwid_nvMarkerRegionMetric
likwid.nvInit = likwid_nvInit
likwid.nvAddEventSet = likwid_nvAddEventSet
likwid.nvFinalize = likwid_nvFinalize
Expand Down Expand Up @@ -1367,59 +1367,10 @@ end
likwid.getArch = llikwid_getArch


local function getGpuMarkerResults(filename, gpulist, nan2value)
local gputopo = likwid.getGpuTopology()
local ret = likwid.readNvMarkerFile(filename)
if ret < 0 then
return nil, nil
elseif ret == 0 then
return {}, {}
end
if not nan2value then
nan2value = '-'
end
results = {}
metrics = {}
for i=1, likwid.nvMarkerNumRegions() do
local regionName = likwid.nvMarkerRegionTag(i)
local groupID = likwid.nvMarkerRegionGroup(i)
local regionGPUs = likwid.nvMarkerRegionGpus(i)
results[i] = {}
metrics[i] = {}
results[i][groupID] = {}
metrics[i][groupID] = {}
for k=1, likwid.nvMarkerRegionEvents(i) do
local eventName = likwid.nvGetNameOfEvent(groupID, k)
local counterName = likwid.nvGetNameOfCounter(groupID, k)
results[i][groupID][k] = {}
for j=1, regionGPUs do
results[i][groupID][k][j] = likwid.nvMarkerRegionResult(i,k,j)
if results[i][groupID][k][j] ~= results[i][groupID][k][j] then
results[i][groupID][k][j] = nan2value
end
end
end
if likwid.nvMarkerRegionMetrics(groupID) > 0 then
for k=1, likwid.nvMarkerRegionMetrics(groupID) do
local metricName = likwid.getNameOfMetric(groupID, k)
metrics[i][likwid.nvMarkerRegionGroup(i)][k] = {}
for j=1, regionGPUs do
metrics[i][groupID][k][j] = likwid.nvMarkerRegionMetric(i,k,j)
if metrics[i][groupID][k][j] ~= metrics[i][groupID][k][j] then
metrics[i][groupID][k][j] = nan2value
end
end
end
end
end
return results, metrics
end

likwid.getGpuMarkerResults = getGpuMarkerResults

local function printGpuOutput(results, metrics, gpulist, region, stats)
local function printOutputCuda(results, metrics, gpulist, region, stats)
local maxLineFields = 0
local gputopo = likwid.getGpuTopology()
local gputopo = likwid.getCudaTopology()
local regionName = likwid.nvMarkerRegionTag(region)
local regionGPUs = likwid.nvMarkerRegionGpus(region)
local cur_gpulist = gpulist
Expand Down Expand Up @@ -1572,10 +1523,10 @@ local function printGpuOutput(results, metrics, gpulist, region, stats)
end
end

likwid.printGpuOutput = printGpuOutput
likwid.printOutputCuda = printOutputCuda

local function getNvMarkerResults(filename, gpulist, nan2value)
local gputopo = likwid.getGpuTopology()
local function getMarkerResultsCuda(filename, gpulist, nan2value)
local gputopo = likwid.getCudaTopology()
local ret = likwid.readNvMarkerFile(filename)
if ret < 0 then
return nil, nil
Expand Down Expand Up @@ -1622,7 +1573,7 @@ local function getNvMarkerResults(filename, gpulist, nan2value)
return results, metrics
end

likwid.getNvMarkerResults = getNvMarkerResults
likwid.getMarkerResultsCuda = getMarkerResultsCuda

local function getMarkerResultsRocm(filename, gpulist, nan2value)
local gputopo = likwid.getGpuTopology_rocm()
Expand Down
49 changes: 31 additions & 18 deletions src/includes/likwid-marker.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,39 +108,43 @@ Shortcut for likwid_markerWriteFile() if compiled with -DLIKWID_PERFMON. Otherwi
*/
/*!
\def LIKWID_NVMARKER_INIT
Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
Shortcut for nvmon_markerInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_THREADINIT
Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
No operation is performed; this macro exists only for symmetry with the CPU MarkerAPI
*/
/*!
\def LIKWID_NVMARKER_REGISTER(regionTag)
Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_START(regionTag)
Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_STOP(regionTag)
Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_SWITCH
Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_RESET(regionTag)
Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_CLOSE
Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
Shortcut for nvmon_markerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/*!
\def LIKWID_NVMARKER_WRITE_FILE(markerfile)
Shortcut for nvmon_markerWriteFile() with \a markerfile if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
*/
/** @}*/

Expand All @@ -149,16 +153,18 @@ Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise
#define LIKWID_WITH_NVMON
#endif
#include <likwid.h>
#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit()
#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit()
#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup()
#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag)
#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag)
#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag)
#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose()
#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag)
#define LIKWID_NVMARKER_INIT nvmon_markerInit()
#define LIKWID_NVMARKER_THREADINIT
#define LIKWID_NVMARKER_SWITCH nvmon_markerNextGroup()
#define LIKWID_NVMARKER_REGISTER(regionTag) nvmon_markerRegisterRegion(regionTag)
#define LIKWID_NVMARKER_START(regionTag) nvmon_markerStartRegion(regionTag)
#define LIKWID_NVMARKER_STOP(regionTag) nvmon_markerStopRegion(regionTag)
#define LIKWID_NVMARKER_CLOSE nvmon_markerClose()
#define LIKWID_NVMARKER_RESET(regionTag) nvmon_markerResetRegion(regionTag)
#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \
likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count)
nvmon_markerGetRegion(regionTag, ngpus, nevents, events, time, count)
#define LIKWID_NVMARKER_WRITE_FILE(markerfile) \
nvmon_markerWriteFile(markerfile)
#else /* LIKWID_NVMON */
#define LIKWID_NVMARKER_INIT
#define LIKWID_NVMARKER_THREADINIT
Expand All @@ -169,6 +175,7 @@ Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise
#define LIKWID_NVMARKER_CLOSE
/* No-op variant used when NVMON support is not compiled in. It is variadic so
 * that it accepts the same six arguments (regionTag, ngpus, nevents, events,
 * time, count) as the enabled variant; the former five-argument form is still
 * accepted. None of the arguments are evaluated. */
#define LIKWID_NVMARKER_GET(regionTag, ...)
#define LIKWID_NVMARKER_RESET(regionTag)
#define LIKWID_NVMARKER_WRITE_FILE(markerfile)
#endif /* LIKWID_NVMON */


Expand Down Expand Up @@ -211,6 +218,10 @@ Shortcut for rocmon_markerResetRegion() if compiled with -DLIKWID_ROCMON. Otherw
\def ROCMON_MARKER_CLOSE
Shortcut for rocmon_markerClose() if compiled with -DLIKWID_ROCMON. Otherwise no operation is performed
*/
/*!
\def ROCMON_MARKER_WRITE_FILE(filename)
Shortcut for rocmon_markerWriteFile() with \a filename if compiled with -DLIKWID_ROCMON. Otherwise no operation is performed
*/
/** @}*/

#ifdef LIKWID_ROCMON
Expand All @@ -227,6 +238,7 @@ Shortcut for rocmon_markerClose() if compiled with -DLIKWID_ROCMON. Otherwise no
#define ROCMON_MARKER_CLOSE rocmon_markerClose()
#define ROCMON_MARKER_RESET(regionTag) rocmon_markerResetRegion(regionTag)
#define ROCMON_MARKER_GET(regionTag, ngpus, nevents, events, time, count) rocmon_markerGetRegion(regionTag, ngpus, nevents, events, time, count)
#define ROCMON_MARKER_WRITE_FILE(filename) rocmon_markerWriteFile(filename)
#else /* LIKWID_ROCMON */
#define ROCMON_MARKER_INIT
#define ROCMON_MARKER_THREADINIT
Expand All @@ -237,6 +249,7 @@ Shortcut for rocmon_markerClose() if compiled with -DLIKWID_ROCMON. Otherwise no
#define ROCMON_MARKER_CLOSE
/* No-op variant used when ROCMON support is not compiled in. It is variadic so
 * that it accepts the same six arguments (regionTag, ngpus, nevents, events,
 * time, count) as the enabled variant; the former five-argument form is still
 * accepted. None of the arguments are evaluated. */
#define ROCMON_MARKER_GET(regionTag, ...)
#define ROCMON_MARKER_RESET(regionTag)
#define ROCMON_MARKER_WRITE_FILE(filename)
#endif /* LIKWID_ROCMON */


Expand Down
25 changes: 25 additions & 0 deletions src/includes/likwid.h
Original file line number Diff line number Diff line change
Expand Up @@ -2327,6 +2327,13 @@ extern void nvmon_markerGetRegion(const char *regionTag, int *nr_gpus,
int *nr_events, double **events,
double *time, int *count)
__attribute__((visibility("default")));
/*! \brief Write the output file of the NvMarker API
@param [in] markerfile Filename for NvMarker API results
@return 0 or negative error number
*/
extern int nvmon_markerWriteFile(const char* markerfile)
__attribute__((visibility("default")));


/*! \brief Read the output file of the NvMarker API
@param [in] filename Filename with NvMarker API results
Expand Down Expand Up @@ -3099,6 +3106,24 @@ Reset the values of all configured counters and timers.
int rocmon_markerResetRegion(const char *regionTag)
__attribute__((visibility("default")));

/*! \brief Write measurement data to file
Write current values to file
@param [in] markerfile Filename for writing
@return Error code of write operation
*/
int rocmon_markerWriteFile(const char *markerfile)
__attribute__((visibility("default")));

/*! \brief Select next group to measure
Must be called in parallel region of the application to switch group on every
CPU.
NOTE(review): this wording matches the CPU MarkerAPI text; confirm whether
"CPU" should read "GPU" for the Rocmon variant.
*/
extern void rocmon_markerNextGroup(void)
__attribute__((visibility("default")));


/*! \brief Read the output file of the RocmonMarker API
@param [in] filename Filename with RocmonMarker API results
@return 0 or negative error number
Expand Down
Loading

0 comments on commit c90c67d

Please sign in to comment.