PR #13674 from gilpazintel: changes to support CUDA build w/o GPU in runtime
IntelRealSense · Jan 21, 2025 · 6128f3f · 6128f3f
2 parents 0cf45c0 + db36030
commit 6128f3f
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 38 deletions.
diff --git a/src/linux/backend-hid.cpp b/src/linux/backend-hid.cpp
@@ -7,6 +7,7 @@
 #include "types.h"
 
 #include <rsutils/string/from.h>
+#include "rsutils/accelerators/gpu.h"
 
 #include <thread>
 #include <chrono>
@@ -1232,23 +1233,26 @@ namespace librealsense
             for (auto& elem : common_sensors)
             {
                 hid_device_info hid_dev_info{};
-                if(!get_hid_device_info(elem.c_str(), hid_dev_info))
+                if (!get_hid_device_info(elem.c_str(), hid_dev_info))
                 {
 #ifdef RS2_USE_CUDA
-                    /* On the Jetson TX, ina3221x is the power monitor (I2C bus)
-                    This code is checking the IIA device directory, but tries to compare as USB HID device
-                    The ina3221x is not a HID device. Check here to avoid spamming the console.
-                    Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
-                    std::string device_path_str(elem.c_str());
-                    device_path_str+="/";
-                    std::string dev_name;
-                    std::ifstream(device_path_str + "name") >> dev_name;
-                    if (dev_name != std::string("ina3221x")) {
-                        LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
+                    if (rsutils::rs2_is_gpu_available())
+                    {
+                        /* On the Jetson TX, ina3221x is the power monitor (I2C bus)
+                        This code is checking the IIA device directory, but tries to compare as USB HID device
+                        The ina3221x is not a HID device. Check here to avoid spamming the console.
+                        Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
+                        std::string device_path_str(elem.c_str());
+                        device_path_str += "/";
+                        std::string dev_name;
+                        std::ifstream(device_path_str + "name") >> dev_name;
+                        if (dev_name != std::string("ina3221x")) {
+                            LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
+                        }
+                        continue;
                     }
-#else
-                    LOG_INFO("Failed to read busnum/devnum. Device Path: " << elem);
 #endif
+                    LOG_INFO("Failed to read busnum/devnum. Device Path: " << elem);
                     continue;
                 }
                 action(hid_dev_info);

diff --git a/src/linux/backend-v4l2.cpp b/src/linux/backend-v4l2.cpp
@@ -58,6 +58,8 @@
 
 #include <sys/signalfd.h>
 #include <signal.h>
+#include "rsutils/accelerators/gpu.h"
+
 #pragma GCC diagnostic ignored "-Woverflow"
 
 const size_t MAX_DEV_PARENT_DIR = 10;
@@ -755,9 +757,12 @@ namespace librealsense
             if (!is_usb_path_valid(video_path, dev_name, busnum, devnum, devpath))
             {
 #ifndef RS2_USE_CUDA
-               /* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
-               Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
-               LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
+                if (rsutils::rs2_is_gpu_available())
+                {
+                    /* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
+                    Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
+                    LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
+                }
 #endif
                throw linux_backend_exception("Failed to read busnum/devnum of usb device");
             }

diff --git a/src/proc/align.cpp b/src/proc/align.cpp
@@ -13,7 +13,9 @@
 
 #if defined(RS2_USE_CUDA)
 #include "proc/cuda/cuda-align.h"
-#elif defined(__SSSE3__)
+#include "rsutils/accelerators/gpu.h"
+#endif
+#if defined(__SSSE3__)
 #include "proc/sse/sse-align.h"
 #endif
 #include "proc/neon/neon-align.h"
@@ -25,8 +27,12 @@ namespace librealsense
     std::shared_ptr<align> align::create_align(rs2_stream align_to)
     {
         #if defined(RS2_USE_CUDA)
+        if (rsutils::rs2_is_gpu_available())
+        {
             return std::make_shared<librealsense::align_cuda>(align_to);
-        #elif defined(__SSSE3__)
+        }
+        #endif
+        #if defined(__SSSE3__)
             return std::make_shared<librealsense::align_sse>(align_to);
         #elif defined(__ARM_NEON) && ! defined(ANDROID)
             return std::make_shared<librealsense::align_neon>(align_to);

diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp
@@ -13,6 +13,7 @@
 
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
+#include "rsutils/accelerators/gpu.h"
 #endif
 #ifdef __SSSE3__
 #include <tmmintrin.h> // For SSSE3 intrinsics
@@ -57,8 +58,11 @@ namespace librealsense
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
 #ifdef RS2_USE_CUDA
-        rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
-        return;
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
+            return;
+        }
 #endif
 #if defined __SSSE3__ && ! defined ANDROID
         static bool do_avx = has_avx();

diff --git a/src/proc/depth-formats-converter.cpp b/src/proc/depth-formats-converter.cpp
@@ -7,6 +7,7 @@
 
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
+#include "rsutils/accelerators/gpu.h"
 #endif
 
 namespace librealsense
@@ -17,11 +18,14 @@ namespace librealsense
         auto in = reinterpret_cast<const uint16_t*>(source);
         auto out_ir = reinterpret_cast<uint8_t *>(dest[1]);
 #ifdef RS2_USE_CUDA
-        rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
-        in += count;
-#else
-        for (int i = 0; i < count; ++i) *out_ir++ = *in++ >> 2;
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
+            in += count;
+        }
+        else
 #endif
+        for (int i = 0; i < count; ++i) *out_ir++ = *in++ >> 2;
         std::memcpy( dest[0], in, count * 2 );
     }
 
@@ -31,11 +35,14 @@ namespace librealsense
         auto in = reinterpret_cast<const uint16_t*>(source);
         auto out_ir = reinterpret_cast<uint16_t*>(dest[1]);
 #ifdef RS2_USE_CUDA
-        rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
-        in += count;
-#else
-        for (int i = 0; i < count; ++i) *out_ir++ = *in++ << 6;
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
+            in += count;
+        }
+        else
 #endif
+        for (int i = 0; i < count; ++i) *out_ir++ = *in++ << 6;
         std::memcpy( dest[0], in, count * 2 );
     }
 

diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp
@@ -18,6 +18,7 @@
 
 #ifdef RS2_USE_CUDA
 #include "proc/cuda/cuda-pointcloud.h"
+#include "rsutils/accelerators/gpu.h"
 #endif
 #ifdef __SSSE3__
 #include "proc/sse/sse-pointcloud.h"
@@ -396,8 +397,12 @@ namespace librealsense
     std::shared_ptr<pointcloud> pointcloud::create()
     {
         #ifdef RS2_USE_CUDA
+        if (rsutils::rs2_is_gpu_available())
+        {
             return std::make_shared<librealsense::pointcloud_cuda>();
-        #elif defined(__SSSE3__)
+        }
+        #endif
+        #ifdef __SSSE3__
             return std::make_shared<librealsense::pointcloud_sse>();
         #elif defined(__ARM_NEON)  && ! defined ANDROID
             return std::make_shared<librealsense::pointcloud_neon>();

diff --git a/src/proc/y12i-to-y16y16-mipi.cpp b/src/proc/y12i-to-y16y16-mipi.cpp
@@ -5,6 +5,7 @@
 #include "stream.h"
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
+#include "rsutils/accelerators/gpu.h"
 #endif
 
 namespace librealsense
@@ -16,12 +17,15 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi *>(source));
-#else
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi*>(source));
+            return;
+        }
+#endif
         split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
             [](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; },  // We want to convert 10-bit data to 16-bit data
             [](const y12i_pixel_mipi& p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
-#endif
     }
 
     y12i_to_y16y16_mipi::y12i_to_y16y16_mipi(int left_idx, int right_idx)

diff --git a/src/proc/y12i-to-y16y16.cpp b/src/proc/y12i-to-y16y16.cpp
@@ -5,6 +5,7 @@
 #include "stream.h"
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
+#include "rsutils/accelerators/gpu.h"
 #endif
 
 namespace librealsense
@@ -15,12 +16,15 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel *>(source));
-#else
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel*>(source));
+            return;
+        }
+#endif
         split_frame(dest, count, reinterpret_cast<const y12i_pixel*>(source),
             [](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; },  // We want to convert 10-bit data to 16-bit data
             [](const y12i_pixel & p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
-#endif
     }
 
     y12i_to_y16y16::y12i_to_y16y16(int left_idx, int right_idx)

diff --git a/src/proc/y8i-to-y8y8.cpp b/src/proc/y8i-to-y8y8.cpp
@@ -8,6 +8,7 @@
 
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
+#include "rsutils/accelerators/gpu.h"
 #endif
 
 namespace librealsense
@@ -17,12 +18,15 @@ namespace librealsense
     {
         auto count = width * height;
 #ifdef RS2_USE_CUDA
-        rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel *>(source));
-#else
+        if (rsutils::rs2_is_gpu_available())
+        {
+            rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel*>(source));
+            return;
+        }
+#endif
         split_frame(dest, count, reinterpret_cast<const y8i_pixel*>(source),
             [](const y8i_pixel & p) -> uint8_t { return p.l; },
             [](const y8i_pixel & p) -> uint8_t { return p.r; });
-#endif
     }
 
     y8i_to_y8y8::y8i_to_y8y8(int left_idx, int right_idx) :

diff --git a/third-party/rsutils/include/rsutils/accelerators/gpu.h b/third-party/rsutils/include/rsutils/accelerators/gpu.h
@@ -0,0 +1,10 @@
+// License: Apache 2.0. See LICENSE file in root directory.
+// Copyright(c) 2025 Intel Corporation. All Rights Reserved.
+
+#pragma once
+
+namespace rsutils {
+
+    bool rs2_is_gpu_available();  // true only in RS2_USE_CUDA builds where the CUDA runtime reports >= 1 device
+
+}  // namespace rsutils
diff --git a/third-party/rsutils/src/rsutilgpu.cpp b/third-party/rsutils/src/rsutilgpu.cpp
@@ -0,0 +1,45 @@
+// License: Apache 2.0. See LICENSE file in root directory.
+// Copyright(c) 2025 Intel Corporation. All Rights Reserved.
+
+#include "rsutils/accelerators/gpu.h"
+#include <rsutils/easylogging/easyloggingpp.h>
+
+#ifdef RS2_USE_CUDA
+#include <cuda_runtime.h>
+#endif
+
+namespace rsutils {
+
+    // Probes the CUDA runtime once and caches whether any GPU was reported.
+    class GPUChecker {
+    public:
+        // True only in RS2_USE_CUDA builds where a CUDA device was found; else false.
+        static bool is_gpu_available() {
+#ifdef RS2_USE_CUDA
+            // A function-local static is initialized exactly once, thread-safely
+            // (C++11), so concurrent callers cannot race on the cached count.
+            static const int gpuDeviceCount = []() {
+                int count = 0;
+                const cudaError_t error = cudaGetDeviceCount(&count);
+                if (error != cudaSuccess) {
+                    LOG_ERROR("cudaGetDeviceCount failed: " << cudaGetErrorString(error));
+                    count = 0; // A failed query is treated as "no GPU"
+                }
+                if (count <= 0) { LOG_INFO("Avoid CUDA execution as no NVIDIA GPU found."); }
+                else { LOG_INFO("Found " << count << " NVIDIA GPU."); }
+                return count;
+            }();
+            return gpuDeviceCount > 0;
+#else
+            return false;
+#endif
+        }
+    };
+
+    // Public entry point declared in gpu.h; forwards to GPUChecker.
+    bool rs2_is_gpu_available() {
+        return rsutils::GPUChecker::is_gpu_available();
+    }
+
+} // namespace rsutils
+