Skip to content

Commit

Permalink
Fix multiple bugs and warnings in nvmon
Browse files Browse the repository at this point in the history
Bugs/warnings fixed:
- CUPTI errors would cause segfault due to the wrong
  cuptiGetResultString function begin called.
- a few signed/unsigned comparisons
- undefined behavior due to missing return statements
- segmentation fault in CUDA device name allocation (off-by-one)
- show compile error for CUDA 12+ with perfworks (still needs fix)
  • Loading branch information
ipatix committed Nov 8, 2024
1 parent 5030863 commit 9e14e9b
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 67 deletions.
51 changes: 27 additions & 24 deletions src/includes/nvmon_cupti.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void (*_dl_non_dynamic_init) (void) __attribute__ ((weak));
CUptiResult _status = (call); \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
(*cuptiGetResultString)(_status, &errstr); \
(*cuptiGetResultStringPtr)(_status, &errstr); \
fprintf(stderr, "Error: function %s failed with error %s.\n", #call, errstr); \
handleerror; \
} \
Expand Down Expand Up @@ -316,8 +316,8 @@ void nvmon_cupti_freeDevice(NvmonDevice_t dev)
int
nvmon_cupti_createDevice(int id, NvmonDevice *dev)
{
int j = 0, k = 0, c = 0;
int numDomains = 0;
int c = 0;
unsigned numDomains = 0;
CUpti_EventDomainID* eventDomainIds = NULL;
int eventIdx = 0;
uint32_t totalEvents = 0;
Expand Down Expand Up @@ -356,7 +356,7 @@ nvmon_cupti_createDevice(int id, NvmonDevice *dev)

// Count the events in all domains to allocate the event list
dev->numAllEvents = 0;
for (j = 0; j < numDomains; j++)
for (unsigned j = 0; j < numDomains; j++)
{
uint32_t domainNumEvents = 0;
CUpti_EventDomainID domainID = eventDomainIds[j];
Expand All @@ -376,7 +376,7 @@ nvmon_cupti_createDevice(int id, NvmonDevice *dev)
dev->eventHash = g_hash_table_new(g_str_hash, g_str_equal);
dev->evIdHash = g_hash_table_new(g_int64_hash, g_int64_equal);

for (j = 0; j < numDomains; j++)
for (unsigned j = 0; j < numDomains; j++)
{
uint32_t domainNumEvents = 0;
CUpti_EventDomainID domainID = eventDomainIds[j];
Expand All @@ -387,7 +387,7 @@ nvmon_cupti_createDevice(int id, NvmonDevice *dev)
CUpti_EventID* cuEventIds = malloc(tmpSize);
// Get the CUPTI events
CUPTI_CALL((*cuptiEventDomainEnumEventsPtr)(domainID, &tmpSize, cuEventIds), return -1);
for (k = 0; k < domainNumEvents; k++)
for (unsigned k = 0; k < domainNumEvents; k++)
{
CUpti_EventID eventId = cuEventIds[k];
// Name and description are limited in length
Expand Down Expand Up @@ -494,6 +494,8 @@ int nvmon_cupti_getEventsOfGpu(int gpuId, NvmonEventList_t* list)
ret = snprintf(out->limit, 9, "GPU");
if (ret > 0) out->limit[ret] = '\0';
break;
default:
break;
}
}
}
Expand Down Expand Up @@ -623,11 +625,11 @@ nvmon_cupti_addEventSets(NvmonDevice_t device, const char* eventString)
CU_CALL((*cuCtxPopCurrentPtr)(&device->context), return -EFAULT);
}
device->numNvEventSets++;
return 0;
}

int nvmon_cupti_setupCounters(NvmonDevice_t device, NvmonEventSet* eventSet)
{
int err = 0;
int popContext = 0;
int oldDevId = -1;
CUpti_EventGroupSets * cuEventSets = NULL;
Expand All @@ -648,7 +650,7 @@ int nvmon_cupti_setupCounters(NvmonDevice_t device, NvmonEventSet* eventSet)


size_t grpEventIdsSize = eventSet->numberOfEvents * sizeof(CUpti_EventID);
CUPTI_CALL((*cuptiEventGroupSetsCreatePtr)(device->context, grpEventIdsSize, eventSet->cuEventIDs, &cuEventSets), err = -1;);
CUPTI_CALL((*cuptiEventGroupSetsCreatePtr)(device->context, grpEventIdsSize, eventSet->cuEventIDs, &cuEventSets), return -1;);
// Allocate temporary array to hold the group event IDs
CUpti_EventID *grpEventIds = malloc(grpEventIdsSize);
if (!grpEventIds)
Expand Down Expand Up @@ -694,13 +696,13 @@ int nvmon_cupti_setupCounters(NvmonDevice_t device, NvmonEventSet* eventSet)
CUpti_EventGroup curGroup;
uint32_t curNumInstances = 0, curNumTotalInstances = 0;
CUpti_EventGroupSet *curEventGroupSet;
for (int j = 0; j < cuEventSets->numSets; j++)
for (unsigned j = 0; j < cuEventSets->numSets; j++)
{
size_t sizeofuint32t = sizeof(uint32_t);
uint32_t numGEvents = 0, numGInstances = 0, numTotalGInstances = 0;
CUpti_EventGroupSet* groupset = &cuEventSets->sets[j];

for (int k = 0; k < groupset->numEventGroups; k++)
for (unsigned k = 0; k < groupset->numEventGroups; k++)
{
uint32_t one = 1;
CUpti_EventGroup group = groupset->eventGroups[k];
Expand All @@ -713,7 +715,7 @@ int nvmon_cupti_setupCounters(NvmonDevice_t device, NvmonEventSet* eventSet)
// Get instance count for a group
CUPTI_CALL((*cuptiEventGroupGetAttributePtr)(group, CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &sizeofuint32t, &numGInstances), return -EFAULT);

for (int l = 0; l < numGEvents; l++)
for (unsigned l = 0; l < numGEvents; l++)
{
for (int m = 0; m < eventSet->numberOfEvents; m++)
{
Expand Down Expand Up @@ -823,27 +825,28 @@ int nvmon_cupti_startCounters(NvmonDevice_t device)
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Pop Context %ld for device %d, device->context, device->deviceId);
CU_CALL((*cuCtxPopCurrentPtr)(&device->context), return -EFAULT);
}
return 0;
}


int nvmon_cupti_stopCounters(NvmonDevice_t device)
{
int i = 0, j = 0, k = 0;
(void)device; // unused

int oldDevId = -1;
uint64_t timestamp = 0;
CUcontext curContext;


CUDA_CALL((*cudaGetDevicePtr)(&oldDevId), return -EFAULT);
// Take the timestamp, we assign it later for all devices
CUPTI_CALL((*cuptiGetTimestampPtr)(&timestamp), return -EFAULT);


for (i = 0; i < nvGroupSet->numberOfGPUs; i++)
for (int i = 0; i < nvGroupSet->numberOfGPUs; i++)
{
int popContext = 0;
uint32_t one = 1;
int maxTotalInstances = 0;
unsigned maxTotalInstances = 0;
size_t valuesSize = 0;
NvmonDevice_t device = &nvGroupSet->gpus[i];
if (device->deviceId != oldDevId)
Expand All @@ -864,21 +867,21 @@ int nvmon_cupti_stopCounters(NvmonDevice_t device)
// Are we in the proper context?
popContext = check_nv_context(device, curContext);

for (j = 0; j < device->numActiveEvents; j++)
for (int j = 0; j < device->numActiveEvents; j++)
{
maxTotalInstances = MAX(maxTotalInstances, device->activeEvents[j].numTotalInstances);
}
uint64_t *tmpValues = (uint64_t *) malloc(maxTotalInstances * sizeof(uint64_t));

for (j = 0; j < device->numActiveEvents; j++)
for (int j = 0; j < device->numActiveEvents; j++)
{
NvmonActiveEvent_t event = &device->activeEvents[j];
valuesSize = sizeof(uint64_t) * event->numTotalInstances;
memset(tmpValues, 0, valuesSize);
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Read Grp %ld Ev %ld for device %d, event->cuGroup, event->cuEventId, device->deviceId);
CUPTI_CALL((*cuptiEventGroupReadEventPtr)(event->cuGroup, CUPTI_EVENT_READ_FLAG_NONE, event->cuEventId, &valuesSize, tmpValues), free(tmpValues); return -EFAULT);
uint64_t valuesum = 0;
for (k = 0; k < event->numInstances; k++)
for (unsigned k = 0; k < event->numInstances; k++)
{
valuesum += tmpValues[k];
}
Expand All @@ -889,7 +892,7 @@ int nvmon_cupti_stopCounters(NvmonDevice_t device)
res->lastValue += res->stopValue - res->currentValue;
res->currentValue = (double)valuesum;
}
for (j = 0; j < device->numActiveCuGroups; j++)
for (int j = 0; j < device->numActiveCuGroups; j++)
{
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Disable group %ld, device->activeCuGroups[j]);
CUPTI_CALL((*cuptiEventGroupSetDisablePtr)(device->activeCuGroups[j]), return -EFAULT);
Expand All @@ -901,20 +904,20 @@ int nvmon_cupti_stopCounters(NvmonDevice_t device)
CU_CALL((*cuCtxPopCurrentPtr)(&device->context), return -EFAULT);
}
}
return 0;
}


int nvmon_cupti_readCounters(NvmonDevice_t device)
{
int j = 0, k = 0;
int oldDevId = -1;
uint64_t timestamp = 0;
CUcontext curContext;
size_t sizeofuint32num = sizeof(uint32_t);
int maxTotalInstances = 0;
unsigned maxTotalInstances = 0;


for (j = 0; j < device->numActiveEvents; j++)
for (int j = 0; j < device->numActiveEvents; j++)
{
maxTotalInstances = MAX(maxTotalInstances, device->activeEvents[j].numTotalInstances);
}
Expand Down Expand Up @@ -961,7 +964,7 @@ int nvmon_cupti_readCounters(NvmonDevice_t device)
CU_CALL((*cuCtxSynchronizePtr)(), return -EFAULT);
NvmonEventSet* nvEventSet = &device->nvEventSets[nvGroupSet->activeGroup];

for (j = 0; j < device->numActiveEvents; j++)
for (int j = 0; j < device->numActiveEvents; j++)
{
NvmonActiveEvent_t event = &device->activeEvents[j];
// Empty space for instance values
Expand All @@ -972,7 +975,7 @@ int nvmon_cupti_readCounters(NvmonDevice_t device)
CUPTI_CALL((*cuptiEventGroupReadEventPtr)(event->cuGroup, CUPTI_EVENT_READ_FLAG_NONE, event->cuEventId, &valuesSize, tmpValues), return -EFAULT);
// Sum all instance values
uint64_t valuesum = 0;
for (k = 0; k < event->numInstances; k++)
for (unsigned k = 0; k < event->numInstances; k++)
{
valuesum += tmpValues[k];
}
Expand Down
4 changes: 4 additions & 0 deletions src/includes/nvmon_perfworks.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
#ifndef LIKWID_NVMON_PERFWORKS_H
#define LIKWID_NVMON_PERFWORKS_H

#if defined(CUDART_VERSION) && CUDART_VERSION >= 12000
#error "NVMON Perfworks is currently not supported on CUDA 12+"
#endif

#if defined(CUDART_VERSION) && CUDART_VERSION > 10000

#include <cuda.h>
Expand Down
3 changes: 2 additions & 1 deletion src/includes/nvmon_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ typedef struct {
typedef enum {
LIKWID_NVMON_CUPTI_BACKEND = 0,
LIKWID_NVMON_PERFWORKS_BACKEND,
LIKWID_NVMON_BACKEND_COUNT,
} NvmonBackends;

typedef struct {
Expand Down Expand Up @@ -200,7 +201,7 @@ typedef struct {
int numberOfGPUs; /*!< \brief Amount of GPUs in \a gpus */
NvmonDevice* gpus; /*!< \brief List of GPUs */
int numberOfBackends;
NvmonFunctions* backends[3];
NvmonFunctions* backends[LIKWID_NVMON_BACKEND_COUNT];

int numGroupSources;
NvmonGroupSourceInfo* groupSources;
Expand Down
17 changes: 5 additions & 12 deletions src/nvmon.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ int likwid_nvmon_verbosity = DEBUGLEV_ONLY_ERROR;

#include <nvmon_cupti.h>
#include <nvmon_nvml.h>
//#include <nvmon_perfworks.h>

#include <nvmon_perfworks.h>

LikwidNvResults* gMarkerResults = NULL;
int gMarkerRegions = 0;
Expand Down Expand Up @@ -118,7 +117,7 @@ nvmon_init(int nrGpus, const int* gpuIds)

init_configuration();

nvGroupSet = (NvmonGroupSet*) malloc(sizeof(NvmonGroupSet));
nvGroupSet = calloc(1, sizeof(NvmonGroupSet));
if (nvGroupSet == NULL)
{
ERROR_PLAIN_PRINT(Cannot allocate group descriptor);
Expand Down Expand Up @@ -327,7 +326,6 @@ nvmon_getEventsOfGpu(int gpuId, NvmonEventList_t* list)
return -EINVAL;
}

//err = nvmon_cupti_functions.getEventList(available, list);
if (gtopo->devices[available].ccapMajor < 7)
{
if (nvmon_cupti_functions.getEventList)
Expand All @@ -337,12 +335,10 @@ nvmon_getEventsOfGpu(int gpuId, NvmonEventList_t* list)
}
else
{
#ifdef LIKWID_NVMON_PERFWORKS_H
if (nvmon_perfworks_functions.getEventList)
{
err = nvmon_perfworks_functions.getEventList(available, list);
}
#endif
}

// Get nvml events and merge lists
Expand Down Expand Up @@ -450,7 +446,6 @@ nvmon_initEventSourceLookupMaps(int gid, int gpuId)
GroupInfo* group = &nvGroupSet->groups[gid];

// Allocate memory
NvmonGroupSourceInfo* info;
if (gid >= nvGroupSet->numGroupSources)
{
int* tmpSourceTypes = (int*) malloc(group->nevents * sizeof(int));
Expand All @@ -475,7 +470,6 @@ nvmon_initEventSourceLookupMaps(int gid, int gpuId)
return -ENOMEM;
}

info = tmpInfo;
nvGroupSet->groupSources = tmpInfo;
nvGroupSet->groupSources[gid].numEvents = group->nevents;
nvGroupSet->groupSources[gid].sourceTypes = tmpSourceTypes;
Expand Down Expand Up @@ -524,7 +518,6 @@ int
nvmon_addEventSet(const char* eventCString)
{
int i = 0, j = 0, k = 0, l = 0, m = 0, err = 0;
int isPerfGroup = 0;

int devId = 0;
int configuredEvents = 0;
Expand Down Expand Up @@ -620,7 +613,6 @@ nvmon_addEventSet(const char* eventCString)
ERROR_PRINT(Cannot read performance group %s, eventCString);
return err;
}
isPerfGroup = 1;
}

// Build source lookup
Expand Down Expand Up @@ -1140,6 +1132,7 @@ int nvmon_getGroups(int gpuId, char*** groups, char*** shortinfos, char*** longi
int nvmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos)
{
perfgroup_returnGroups(nrgroups, groups, shortinfos, longinfos);
return 0;
}

int nvmon_getNumberOfMetrics(int groupId)
Expand Down Expand Up @@ -1355,7 +1348,7 @@ nvmon_readMarkerFile(const char* filename)
return -ENOMEM;
}
gMarkerRegions = regions;
for ( uint32_t i=0; i < regions; i++ )
for (int i = 0; i < regions; i++)
{
regionGPUs[i] = 0;
gMarkerResults[i].gpuCount = gpus;
Expand Down Expand Up @@ -1450,7 +1443,7 @@ nvmon_readMarkerFile(const char* filename)
}
}
}
for ( uint32_t i=0; i < regions; i++ )
for (int i = 0; i < regions; i++)
{
gMarkerResults[i].gpuCount = regionGPUs[i];
}
Expand Down
Loading

0 comments on commit 9e14e9b

Please sign in to comment.