Skip to content

Commit

Permalink
Add run time compilation (#11225)
Browse files Browse the repository at this point in the history
Summary:
- Adds a CompiledModule abstraction on top of Cuda run time compilation.

- Adds a cache of run time compiled kernels. The cache returns a kernel immediately and leaves the kernel compiling in the background. The kernel's methods wait for the compilation to be ready.

- Tests that the runtime API and driver API streams are interchangeable when running a dynamically generated kernel.

- Adds proper use of contexts, one per device. The contexts are needed because run time compilation is handled through the driver API.

- Add device properties to the Device* struct.


Differential Revision: D64205005

Pulled By: oerling
  • Loading branch information
Orri Erling authored and facebook-github-bot committed Oct 21, 2024
1 parent 838b486 commit 16bd50b
Show file tree
Hide file tree
Showing 13 changed files with 750 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linux-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:
"-DVELOX_ENABLE_GCS=ON"
"-DVELOX_ENABLE_ABFS=ON"
"-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON"
"-DVELOX_ENABLE_GPU=ON"
"-DVELOX_ENABLE_GPU=OFF"
"-DVELOX_MONO_LIBRARY=ON"
)
make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}"
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 a)
if(VELOX_ENABLE_S3)
# Set AWS_ROOT_DIR if you have a custom install location of AWS SDK CPP.
if(AWSSDK_ROOT_DIR)
set(CMAKE_PREFIX_PATH ${AWSSDK_ROOT_DIR})
list(APPEND CMAKE_PREFIX_PATH ${AWSSDK_ROOT_DIR})
endif()
find_package(AWSSDK REQUIRED COMPONENTS s3;identity-management)
add_definitions(-DVELOX_ENABLE_S3)
Expand Down Expand Up @@ -381,6 +381,7 @@ if(${VELOX_ENABLE_GPU})
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
endif()
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
find_package(CUDAToolkit REQUIRED)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
Expand Down
10 changes: 8 additions & 2 deletions velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@ velox_add_library(
velox_wave_common
GpuArena.cpp
Buffer.cpp
Compile.cu
Cuda.cu
Exception.cpp
KernelCache.cpp
Type.cpp
ResultStaging.cpp)

velox_link_libraries(velox_wave_common velox_exception velox_common_base
velox_type)
velox_link_libraries(
velox_wave_common
velox_exception
velox_common_base
velox_type
CUDA::nvrtc)

if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
Expand Down
225 changes: 225 additions & 0 deletions velox/experimental/wave/common/Compile.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fmt/format.h>
#include <gflags/gflags.h>
#include <nvrtc.h>
#include "velox/experimental/wave/common/Cuda.h"
#include "velox/experimental/wave/common/CudaUtil.cuh"
#include "velox/experimental/wave/common/Exception.h"

// gflags string flag holding the value for the compiler's --gpu-architecture
// option for generated code (per its help text). Defaults to "compute_70".
// NOTE(review): nothing in the visible part of this file reads
// FLAGS_wavegen_architecture - confirm it is consumed elsewhere or should be
// added to the options assembled in CompiledModule::create().
DEFINE_string(
wavegen_architecture,
"compute_70",
"--gpu-architecture flag for generated code");

namespace facebook::velox::wave {

// Reports 'result' through waveError() with NVRTC's description of the error
// unless it is NVRTC_SUCCESS.
void nvrtcCheck(nvrtcResult result) {
  if (result == NVRTC_SUCCESS) {
    return;
  }
  waveError(nvrtcGetErrorString(result));
}

// CompiledModule implementation backed by a CUDA driver API module. Owns
// 'module_' and unloads it on destruction. 'kernels_' holds the function
// handles that launch() and info() select with 'kernelIdx'.
class CompiledModuleImpl : public CompiledModule {
 public:
  CompiledModuleImpl(CUmodule module, std::vector<CUfunction> kernels)
      : module_(module), kernels_(std::move(kernels)) {}

  // The destructor unloads 'module_', so a copy would unload the same handle a
  // second time. Copying is therefore disabled.
  CompiledModuleImpl(const CompiledModuleImpl&) = delete;
  CompiledModuleImpl& operator=(const CompiledModuleImpl&) = delete;

  // Unloads the module. A failure is logged rather than raised since this
  // runs in a destructor.
  ~CompiledModuleImpl() {
    auto result = cuModuleUnload(module_);
    if (result != CUDA_SUCCESS) {
      LOG(ERROR) << "Error in unloading module " << result;
    }
  }

  void launch(
      int32_t kernelIdx,
      int32_t numBlocks,
      int32_t numThreads,
      int32_t shared,
      Stream* stream,
      void** args) override;

  KernelInfo info(int32_t kernelIdx) override;

 private:
  // Driver API module handle owned by this object.
  CUmodule module_;
  // Function handles, indexed by the 'kernelIdx' argument of launch()/info().
  std::vector<CUfunction> kernels_;
};

// Appends to 'data' one option string made of 'flag' immediately followed by
// the first 'length' bytes of 'value'. 'value' does not have to be
// null-terminated at 'length', which lets callers pass a slice of a longer
// separator-delimited list. The stored string contains exactly the option
// text, so size() and comparisons see no embedded terminator; data()/c_str()
// are still null-terminated for use as C strings.
void addFlag(
    const char* flag,
    const char* value,
    int32_t length,
    std::vector<std::string>& data) {
  std::string str(flag);
  str.append(value, static_cast<size_t>(length));
  data.push_back(std::move(str));
}

// Gets compiler options from the environment and appends them to 'opts'. The
// memory is owned by 'data'.
//
// WAVE_NVRTC_INCLUDE_PATH is a ':'-separated list; each entry becomes one
// "-I<entry>" option. WAVE_NVRTC_FLAGS is a ' '-separated list; each entry is
// added verbatim. Empty entries (leading, trailing or doubled separators) are
// skipped so that no empty option or bare "-I" is produced.
//
// 'opts' receives one pointer for every string in 'data', including strings
// that were already in 'data' on entry. The pointers are only valid while
// 'data' is neither modified nor destroyed.
void getNvrtcOptions(
    std::vector<const char*>& opts,
    std::vector<std::string>& data) {
  // Splits 'list' on 'separator' and adds 'prefix' + token for each non-empty
  // token. A null or empty 'list' adds nothing.
  auto addTokens = [&data](const char* list, char separator, const char* prefix) {
    if (!list) {
      return;
    }
    while (*list) {
      const char* end = strchr(list, separator);
      const auto length = end ? static_cast<size_t>(end - list) : strlen(list);
      if (length > 0) {
        addFlag(prefix, list, static_cast<int32_t>(length), data);
      }
      if (!end) {
        break;
      }
      list = end + 1;
    }
  };

  addTokens(getenv("WAVE_NVRTC_INCLUDE_PATH"), ':', "-I");
  addTokens(getenv("WAVE_NVRTC_FLAGS"), ' ', "");

  // Take the pointers only after 'data' is complete; growing the vector
  // earlier could move the strings.
  for (auto& str : data) {
    opts.push_back(str.data());
  }
}

std::shared_ptr<CompiledModule> CompiledModule::create(const KernelSpec& spec) {
nvrtcProgram prog;
nvrtcCreateProgram(
&prog,
spec.code.c_str(), // buffer
spec.filePath.c_str(), // name
spec.numHeaders, // numHeaders
spec.headers, // headers
spec.headerNames); // includeNames
for (auto& name : spec.entryPoints) {
nvrtcCheck(nvrtcAddNameExpression(prog, name.c_str()));
}
std::vector<const char*> opts;
std::vector<std::string> optsData;
#ifndef NDEBUG
optsData.push_back("-G");
#else
optsData.push_back("-O3");
#endif
getNvrtcOptions(opts, optsData);

auto compileResult = nvrtcCompileProgram(
prog, // prog
opts.size(), // numOptions
opts.data()); // options

size_t logSize;

nvrtcGetProgramLogSize(prog, &logSize);
std::string log;
log.resize(logSize);
nvrtcGetProgramLog(prog, log.data());

if (compileResult != NVRTC_SUCCESS) {
nvrtcDestroyProgram(&prog);
waveError(std::string("Cuda compilation error: ") + log);
}
// Obtain PTX from the program.
size_t ptxSize;
nvrtcCheck(nvrtcGetPTXSize(prog, &ptxSize));
std::string ptx;
ptx.resize(ptxSize);
nvrtcCheck(nvrtcGetPTX(prog, ptx.data()));
std::vector<std::string> loweredNames;
for (auto& entry : spec.entryPoints) {
const char* temp;
nvrtcCheck(nvrtcGetLoweredName(prog, entry.c_str(), &temp));
loweredNames.push_back(std::string(temp));
}

nvrtcDestroyProgram(&prog);
CUjit_option options[] = {
CU_JIT_INFO_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
char info[1024];
char error[1024];
uint32_t infoSize = sizeof(info);
uint32_t errorSize = sizeof(error);
void* values[] = {info, &infoSize, error, &errorSize};

CUmodule module;
auto loadResult = cuModuleLoadDataEx(
&module, ptx.data(), sizeof(values) / sizeof(void*), options, values);
if (loadResult != CUDA_SUCCESS) {
LOG(ERROR) << "Load error " << errorSize << " " << infoSize;
waveError(fmt::format("Error in load module: {} {}", info, error));
}
std::vector<CUfunction> funcs;
for (auto& name : loweredNames) {
funcs.emplace_back();
CU_CHECK(cuModuleGetFunction(&funcs.back(), module, name.c_str()));
}
return std::make_shared<CompiledModuleImpl>(module, std::move(funcs));
}

void CompiledModuleImpl::launch(
int32_t kernelIdx,
int32_t numBlocks,
int32_t numThreads,
int32_t shared,
Stream* stream,
void** args) {
auto result = cuLaunchKernel(
kernels_[kernelIdx],
numBlocks,
1,
1, // grid dim
numThreads,
1,
1, // block dim
shared,
reinterpret_cast<CUstream>(stream->stream()->stream),
args,
0);
CU_CHECK(result);
};

// Returns resource usage figures for kernel number 'kernelIdx':
//  - numRegs, sharedMemory and maxThreadsPerBlock come from the function
//    attributes CU_FUNC_ATTRIBUTE_NUM_REGS, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
//    and CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK.
//  - maxOccupancy0 is the maximum number of active blocks per multiprocessor
//    for 256-thread blocks with no dynamic shared memory.
//  - maxOccupancy32 is the same with 256 * 32 bytes of dynamic shared memory.
// Every driver call is checked with CU_CHECK, as in launch(), so a failed
// query is reported instead of returning uninitialized values.
KernelInfo CompiledModuleImpl::info(int32_t kernelIdx) {
  // Block size and per-thread dynamic shared memory used for the occupancy
  // queries.
  constexpr int32_t kBlockSize = 256;
  constexpr int32_t kSharedBytesPerThread = 32;

  KernelInfo info;
  auto f = kernels_[kernelIdx];

  auto result = cuFuncGetAttribute(&info.numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, f);
  CU_CHECK(result);
  result = cuFuncGetAttribute(
      &info.sharedMemory, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, f);
  CU_CHECK(result);
  result = cuFuncGetAttribute(
      &info.maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, f);
  CU_CHECK(result);

  int32_t max = 0;
  result = cuOccupancyMaxActiveBlocksPerMultiprocessor(&max, f, kBlockSize, 0);
  CU_CHECK(result);
  info.maxOccupancy0 = max;

  result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &max, f, kBlockSize, kBlockSize * kSharedBytesPerThread);
  CU_CHECK(result);
  info.maxOccupancy32 = max;
  return info;
}

} // namespace facebook::velox::wave
Loading

0 comments on commit 16bd50b

Please sign in to comment.