Skip to content

Commit

Permalink
PR #13674 from gilpazintel: changes to support CUDA build w/o GPU in …
Browse files Browse the repository at this point in the history
…runtime
  • Loading branch information
Nir-Az authored Jan 21, 2025
2 parents 0cf45c0 + db36030 commit 6128f3f
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 38 deletions.
30 changes: 17 additions & 13 deletions src/linux/backend-hid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "types.h"

#include <rsutils/string/from.h>
#include "rsutils/accelerators/gpu.h"

#include <thread>
#include <chrono>
Expand Down Expand Up @@ -1232,23 +1233,26 @@ namespace librealsense
for (auto& elem : common_sensors)
{
hid_device_info hid_dev_info{};
if(!get_hid_device_info(elem.c_str(), hid_dev_info))
if (!get_hid_device_info(elem.c_str(), hid_dev_info))
{
#ifdef RS2_USE_CUDA
/* On the Jetson TX, ina3221x is the power monitor (I2C bus)
This code is checking the IIA device directory, but tries to compare as USB HID device
The ina3221x is not a HID device. Check here to avoid spamming the console.
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
std::string device_path_str(elem.c_str());
device_path_str+="/";
std::string dev_name;
std::ifstream(device_path_str + "name") >> dev_name;
if (dev_name != std::string("ina3221x")) {
LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
if (rsutils::rs2_is_gpu_available())
{
/* On the Jetson TX, ina3221x is the power monitor (I2C bus)
This code is checking the IIO device directory, but tries to compare it as a USB HID device
The ina3221x is not a HID device. Check here to avoid spamming the console.
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
std::string device_path_str(elem.c_str());
device_path_str += "/";
std::string dev_name;
std::ifstream(device_path_str + "name") >> dev_name;
if (dev_name != std::string("ina3221x")) {
LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
}
continue;
}
#else
LOG_INFO("Failed to read busnum/devnum. Device Path: " << elem);
#endif
LOG_INFO("Failed to read busnum/devnum. Device Path: " << elem);
continue;
}
action(hid_dev_info);
Expand Down
11 changes: 8 additions & 3 deletions src/linux/backend-v4l2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@

#include <sys/signalfd.h>
#include <signal.h>
#include "rsutils/accelerators/gpu.h"

#pragma GCC diagnostic ignored "-Woverflow"

const size_t MAX_DEV_PARENT_DIR = 10;
Expand Down Expand Up @@ -755,9 +757,12 @@ namespace librealsense
if (!is_usb_path_valid(video_path, dev_name, busnum, devnum, devpath))
{
#ifndef RS2_USE_CUDA
/* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
if (rsutils::rs2_is_gpu_available())
{
/* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
}
#endif
throw linux_backend_exception("Failed to read busnum/devnum of usb device");
}
Expand Down
10 changes: 8 additions & 2 deletions src/proc/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

#if defined(RS2_USE_CUDA)
#include "proc/cuda/cuda-align.h"
#elif defined(__SSSE3__)
#include "rsutils/accelerators/gpu.h"
#endif
#if defined(__SSSE3__)
#include "proc/sse/sse-align.h"
#endif
#include "proc/neon/neon-align.h"
Expand All @@ -25,8 +27,12 @@ namespace librealsense
std::shared_ptr<align> align::create_align(rs2_stream align_to)
{
#if defined(RS2_USE_CUDA)
if (rsutils::rs2_is_gpu_available())
{
return std::make_shared<librealsense::align_cuda>(align_to);
#elif defined(__SSSE3__)
}
#endif
#if defined(__SSSE3__)
return std::make_shared<librealsense::align_sse>(align_to);
#elif defined(__ARM_NEON) && ! defined(ANDROID)
return std::make_shared<librealsense::align_neon>(align_to);
Expand Down
8 changes: 6 additions & 2 deletions src/proc/color-formats-converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/accelerators/gpu.h"
#endif
#ifdef __SSSE3__
#include <tmmintrin.h> // For SSSE3 intrinsics
Expand Down Expand Up @@ -57,8 +58,11 @@ namespace librealsense
auto n = width * height;
assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
#ifdef RS2_USE_CUDA
rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
return;
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
return;
}
#endif
#if defined __SSSE3__ && ! defined ANDROID
static bool do_avx = has_avx();
Expand Down
23 changes: 15 additions & 8 deletions src/proc/depth-formats-converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/accelerators/gpu.h"
#endif

namespace librealsense
Expand All @@ -17,11 +18,14 @@ namespace librealsense
auto in = reinterpret_cast<const uint16_t*>(source);
auto out_ir = reinterpret_cast<uint8_t *>(dest[1]);
#ifdef RS2_USE_CUDA
rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
#else
for (int i = 0; i < count; ++i) *out_ir++ = *in++ >> 2;
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
}
else
#endif
for (int i = 0; i < count; ++i) *out_ir++ = *in++ >> 2;
std::memcpy( dest[0], in, count * 2 );
}

Expand All @@ -31,11 +35,14 @@ namespace librealsense
auto in = reinterpret_cast<const uint16_t*>(source);
auto out_ir = reinterpret_cast<uint16_t*>(dest[1]);
#ifdef RS2_USE_CUDA
rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
#else
for (int i = 0; i < count; ++i) *out_ir++ = *in++ << 6;
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
}
else
#endif
for (int i = 0; i < count; ++i) *out_ir++ = *in++ << 6;
std::memcpy( dest[0], in, count * 2 );
}

Expand Down
7 changes: 6 additions & 1 deletion src/proc/pointcloud.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#ifdef RS2_USE_CUDA
#include "proc/cuda/cuda-pointcloud.h"
#include "rsutils/accelerators/gpu.h"
#endif
#ifdef __SSSE3__
#include "proc/sse/sse-pointcloud.h"
Expand Down Expand Up @@ -396,8 +397,12 @@ namespace librealsense
std::shared_ptr<pointcloud> pointcloud::create()
{
#ifdef RS2_USE_CUDA
if (rsutils::rs2_is_gpu_available())
{
return std::make_shared<librealsense::pointcloud_cuda>();
#elif defined(__SSSE3__)
}
#endif
#ifdef __SSSE3__
return std::make_shared<librealsense::pointcloud_sse>();
#elif defined(__ARM_NEON) && ! defined ANDROID
return std::make_shared<librealsense::pointcloud_neon>();
Expand Down
10 changes: 7 additions & 3 deletions src/proc/y12i-to-y16y16-mipi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "stream.h"
#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/accelerators/gpu.h"
#endif

namespace librealsense
Expand All @@ -16,12 +17,15 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi*>(source));
return;
}
#endif
split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
[](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
[](const y12i_pixel_mipi& p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
#endif
}

y12i_to_y16y16_mipi::y12i_to_y16y16_mipi(int left_idx, int right_idx)
Expand Down
10 changes: 7 additions & 3 deletions src/proc/y12i-to-y16y16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "stream.h"
#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/accelerators/gpu.h"
#endif

namespace librealsense
Expand All @@ -15,12 +16,15 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel*>(source));
return;
}
#endif
split_frame(dest, count, reinterpret_cast<const y12i_pixel*>(source),
[](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
[](const y12i_pixel & p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
#endif
}

y12i_to_y16y16::y12i_to_y16y16(int left_idx, int right_idx)
Expand Down
10 changes: 7 additions & 3 deletions src/proc/y8i-to-y8y8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/accelerators/gpu.h"
#endif

namespace librealsense
Expand All @@ -17,12 +18,15 @@ namespace librealsense
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel*>(source));
return;
}
#endif
split_frame(dest, count, reinterpret_cast<const y8i_pixel*>(source),
[](const y8i_pixel & p) -> uint8_t { return p.l; },
[](const y8i_pixel & p) -> uint8_t { return p.r; });
#endif
}

y8i_to_y8y8::y8i_to_y8y8(int left_idx, int right_idx) :
Expand Down
10 changes: 10 additions & 0 deletions third-party/rsutils/include/rsutils/accelerators/gpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2025 Intel Corporation. All Rights Reserved.

#pragma once

namespace rsutils {

// Reports whether GPU acceleration can be used at runtime.
//
// In builds compiled with RS2_USE_CUDA the implementation asks the CUDA
// runtime for the device count on the first call and remembers the answer;
// the function returns true only when at least one device was reported.
// A failed query is logged and treated as "no GPU".
// In builds without RS2_USE_CUDA the function always returns false.
bool rs2_is_gpu_available();

} // namespace rsutils
45 changes: 45 additions & 0 deletions third-party/rsutils/src/rsutilgpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2025 Intel Corporation. All Rights Reserved.

#include "rsutils/accelerators/gpu.h"
#include <rsutils/easylogging/easyloggingpp.h>

#ifdef RS2_USE_CUDA
#include <cuda_runtime.h>
#endif

namespace rsutils {

// Determines, once per process, whether a CUDA device can be used at runtime.
class GPUChecker {
public:
    // Returns true when the library was built with RS2_USE_CUDA and the CUDA
    // runtime reported at least one device; returns false otherwise
    // (including every build without RS2_USE_CUDA).
    //
    // The device query runs on the first call only. Its result lives in a
    // function-local static whose initialization is guaranteed by the
    // language to happen exactly once, even if several threads make the
    // first call concurrently. This replaces a mutable "-1 == not probed yet"
    // sentinel that was read and written without any synchronization.
    static bool is_gpu_available() {
#ifdef RS2_USE_CUDA
        static const int gpu_device_count = query_gpu_device_count();
        return gpu_device_count > 0;
#else
        return false;
#endif
    }

#ifdef RS2_USE_CUDA
private:
    // Asks the CUDA runtime how many devices exist and logs the outcome.
    // A failed query is logged as an error and reported as zero devices, so
    // callers fall back to the non-CUDA code paths.
    static int query_gpu_device_count() {
        int device_count = 0;
        const cudaError_t error = cudaGetDeviceCount(&device_count);
        if (error != cudaSuccess) {
            LOG_ERROR("cudaGetDeviceCount failed: " << cudaGetErrorString(error));
            device_count = 0; // Treat a failed query as "no GPU"
        }
        if (device_count <= 0)
        {
            LOG_INFO("Avoid CUDA execution as no NVIDIA GPU found.");
        }
        else
        {
            LOG_INFO("Found " << device_count << " NVIDIA GPU.");
        }
        return device_count;
    }
#endif
};

// Public entry point declared in rsutils/accelerators/gpu.h.
// Forwards to GPUChecker and returns its answer unchanged.
bool rs2_is_gpu_available()
{
    const bool gpu_present = rsutils::GPUChecker::is_gpu_available();
    return gpu_present;
}

} // namespace rsutils

0 comments on commit 6128f3f

Please sign in to comment.