Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to support CUDA build w/o GPU in runtime #13674

29 changes: 17 additions & 12 deletions src/linux/backend-hid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "types.h"

#include <rsutils/string/from.h>
#include "rsutils/rsutilgpu.h"

#include <thread>
#include <chrono>
Expand Down Expand Up @@ -1232,21 +1233,25 @@ namespace librealsense
for (auto& elem : common_sensors)
{
hid_device_info hid_dev_info{};
if(!get_hid_device_info(elem.c_str(), hid_dev_info))
if (!get_hid_device_info(elem.c_str(), hid_dev_info))
{
#ifdef RS2_USE_CUDA
/* On the Jetson TX, ina3221x is the power monitor (on the I2C bus).
This code is checking the IIO device directory, but tries to treat each entry as a USB HID device.
The ina3221x is not a HID device, so check for it here to avoid spamming the console.
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
std::string device_path_str(elem.c_str());
device_path_str+="/";
std::string dev_name;
std::ifstream(device_path_str + "name") >> dev_name;
if (dev_name != std::string("ina3221x")) {
LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
if (rsutils::rs2_is_gpu_available())
{
/* On the Jetson TX, ina3221x is the power monitor (on the I2C bus).
This code is checking the IIO device directory, but tries to treat each entry as a USB HID device.
The ina3221x is not a HID device, so check for it here to avoid spamming the console.
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
std::string device_path_str(elem.c_str());
device_path_str += "/";
std::string dev_name;
std::ifstream(device_path_str + "name") >> dev_name;
if (dev_name != std::string("ina3221x")) {
LOG_WARNING("Failed to read busnum/devnum. Device Path: " << elem);
}
}
#else
#endif
#ifndef RS2_USE_CUDA
LOG_INFO("Failed to read busnum/devnum. Device Path: " << elem);
#endif
continue;
Expand Down
11 changes: 8 additions & 3 deletions src/linux/backend-v4l2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@

#include <sys/signalfd.h>
#include <signal.h>
#include "rsutils/rsutilgpu.h"

#pragma GCC diagnostic ignored "-Woverflow"

const size_t MAX_DEV_PARENT_DIR = 10;
Expand Down Expand Up @@ -755,9 +757,12 @@ namespace librealsense
if (!is_usb_path_valid(video_path, dev_name, busnum, devnum, devpath))
{
#ifndef RS2_USE_CUDA
/* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
if (rsutils::rs2_is_gpu_available())
{
/* On the Jetson TX, the camera module is CSI & I2C and does not report as this code expects
Patch suggested by JetsonHacks: https://github.com/jetsonhacks/buildLibrealsense2TX */
LOG_INFO("Failed to read busnum/devnum. Device Path: " << ("/sys/class/video4linux/" + name));
}
#endif
throw linux_backend_exception("Failed to read busnum/devnum of usb device");
}
Expand Down
24 changes: 15 additions & 9 deletions src/proc/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

#if defined(RS2_USE_CUDA)
#include "proc/cuda/cuda-align.h"
#elif defined(__SSSE3__)
#include "rsutils/rsutilgpu.h"
#endif
#if defined(__SSSE3__)
#include "proc/sse/sse-align.h"
#endif
#include "proc/neon/neon-align.h"
Expand All @@ -24,15 +26,19 @@ namespace librealsense

std::shared_ptr<align> align::create_align(rs2_stream align_to)
{
#if defined(RS2_USE_CUDA)
#if defined(RS2_USE_CUDA)
if (rsutils::rs2_is_gpu_available())
{
return std::make_shared<librealsense::align_cuda>(align_to);
#elif defined(__SSSE3__)
return std::make_shared<librealsense::align_sse>(align_to);
#elif defined(__ARM_NEON) && ! defined(ANDROID)
return std::make_shared<librealsense::align_neon>(align_to);
#else
return std::make_shared<librealsense::align>(align_to);
#endif
}
#endif
#if defined(__SSSE3__)
return std::make_shared<librealsense::align_sse>(align_to);
#elif defined(__ARM_NEON) && ! defined(ANDROID)
return std::make_shared<librealsense::align_neon>(align_to);
#else
return std::make_shared<librealsense::align>(align_to);
#endif
}

template<class GET_DEPTH, class TRANSFER_PIXEL>
Expand Down
13 changes: 9 additions & 4 deletions src/proc/color-formats-converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/rsutilgpu.h"
#endif
#ifdef __SSSE3__
#include <tmmintrin.h> // For SSSE3 intrinsics
#endif
#include "neon/image-neon.h"


#if defined (ANDROID) || (defined (__linux__) && !defined (__x86_64__)) || (defined (__APPLE__) && !defined (__x86_64__))

bool has_avx() { return false; }
Expand All @@ -45,20 +47,23 @@ bool has_avx()

#endif

namespace librealsense
namespace librealsense
{
/////////////////////////////
// YUY2 unpacking routines //
/////////////////////////////
// This templated function unpacks YUY2 into Y8/Y16/RGB8/RGBA8/BGR8/BGRA8, depending on the compile-time parameter FORMAT.
// It is expected that all branching outside of the loop control variable will be removed due to constant-folding.
template<rs2_format FORMAT> void unpack_yuy2( uint8_t * const d[], const uint8_t * s, int width, int height, int actual_size)
template<rs2_format FORMAT> void unpack_yuy2(uint8_t* const d[], const uint8_t* s, int width, int height, int actual_size)
{
auto n = width * height;
assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
#ifdef RS2_USE_CUDA
rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
return;
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
return;
}
#endif
#if defined __SSSE3__ && ! defined ANDROID
static bool do_avx = has_avx();
Expand Down
31 changes: 20 additions & 11 deletions src/proc/depth-formats-converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,45 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/rsutilgpu.h"
#endif

namespace librealsense
{
void unpack_z16_y8_from_sr300_inzi( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size)
void unpack_z16_y8_from_sr300_inzi(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size)
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
{
auto count = width * height;
auto in = reinterpret_cast<const uint16_t*>(source);
auto out_ir = reinterpret_cast<uint8_t *>(dest[1]);
auto out_ir = reinterpret_cast<uint8_t*>(dest[1]);
#ifdef RS2_USE_CUDA
rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_z16_y8_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
}
#endif
#ifndef RS2_USE_CUDA
for (int i = 0; i < count; ++i) *out_ir++ = *in++ >> 2;
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
#endif
std::memcpy( dest[0], in, count * 2 );
std::memcpy(dest[0], in, count * 2);
}

void unpack_z16_y16_from_sr300_inzi( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size)
void unpack_z16_y16_from_sr300_inzi(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size)
{
auto count = width * height;
auto in = reinterpret_cast<const uint16_t*>(source);
auto out_ir = reinterpret_cast<uint16_t*>(dest[1]);
#ifdef RS2_USE_CUDA
rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::unpack_z16_y16_from_sr300_inzi_cuda(out_ir, in, count);
in += count;
}
#endif
#ifndef RS2_USE_CUDA
for (int i = 0; i < count; ++i) *out_ir++ = *in++ << 6;
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
#endif
std::memcpy( dest[0], in, count * 2 );
std::memcpy(dest[0], in, count * 2);
}

void unpack_inzi(rs2_format dst_ir_format, uint8_t * const d[], const uint8_t * s, int width, int height, int actual_size)
Expand Down
21 changes: 13 additions & 8 deletions src/proc/pointcloud.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "proc/sse/sse-pointcloud.h"
#endif
#include "proc/neon/neon-pointcloud.h"
#include "rsutils/rsutilgpu.h"
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved


namespace librealsense
Expand Down Expand Up @@ -395,15 +396,19 @@ namespace librealsense

std::shared_ptr<pointcloud> pointcloud::create()
{
#ifdef RS2_USE_CUDA
#ifdef RS2_USE_CUDA
if (rsutils::rs2_is_gpu_available())
{
return std::make_shared<librealsense::pointcloud_cuda>();
#elif defined(__SSSE3__)
return std::make_shared<librealsense::pointcloud_sse>();
#elif defined(__ARM_NEON) && ! defined ANDROID
return std::make_shared<librealsense::pointcloud_neon>();
#else
return std::make_shared<librealsense::pointcloud>();
#endif
}
#endif
#ifdef __SSSE3__
return std::make_shared<librealsense::pointcloud_sse>();
#elif defined(__ARM_NEON) && ! defined ANDROID
return std::make_shared<librealsense::pointcloud_neon>();
#else
return std::make_shared<librealsense::pointcloud>();
#endif
}

bool pointcloud::run__occlusion_filter(const rs2_extrinsics& extr)
Expand Down
19 changes: 12 additions & 7 deletions src/proc/y12i-to-y16y16-mipi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,24 @@
#include "stream.h"
#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/rsutilgpu.h"
#endif

namespace librealsense
{
// D457 dev - 8 bits of padding are added after every 24 bits of pixel data; this should be removed once it is corrected in SerDes
// D457 dev - 8 bits of padding are added after every 24 bits of pixel data; this should be removed once it is corrected in SerDes
struct y12i_pixel_mipi { uint8_t rl : 8, rh : 4, ll : 4, lh : 8, padding : 8; int l() const { return lh << 4 | ll; } int r() const { return rh << 8 | rl; } };

void unpack_y16_y16_from_y12i_10_mipi( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size)
void unpack_y16_y16_from_y12i_10_mipi(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size)
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel_mipi*>(source));
}
#endif
#ifndef RS2_USE_CUDA
split_frame(dest, count, reinterpret_cast<const y12i_pixel_mipi*>(source),
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
[](const y12i_pixel_mipi& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
[](const y12i_pixel_mipi& p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
Expand All @@ -27,12 +32,12 @@ namespace librealsense
y12i_to_y16y16_mipi::y12i_to_y16y16_mipi(int left_idx, int right_idx)
: y12i_to_y16y16_mipi("Y12I to Y16L Y16R Transform", left_idx, right_idx) {}

y12i_to_y16y16_mipi::y12i_to_y16y16_mipi(const char * name, int left_idx, int right_idx)
y12i_to_y16y16_mipi::y12i_to_y16y16_mipi(const char* name, int left_idx, int right_idx)
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
: interleaved_functional_processing_block(name, RS2_FORMAT_Y12I, RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 1,
RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 2)
RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 2)
{}

void y12i_to_y16y16_mipi::process_function( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size, int input_size)
void y12i_to_y16y16_mipi::process_function(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size, int input_size)
{
unpack_y16_y16_from_y12i_10_mipi(dest, source, width, height, actual_size);
}
Expand Down
21 changes: 13 additions & 8 deletions src/proc/y12i-to-y16y16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,38 @@
#include "stream.h"
#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/rsutilgpu.h"
#endif

namespace librealsense
{
struct y12i_pixel { uint8_t rl : 8, rh : 4, ll : 4, lh : 8; int l() const { return lh << 4 | ll; } int r() const { return rh << 8 | rl; } };

void unpack_y16_y16_from_y12i_10( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size)
void unpack_y16_y16_from_y12i_10(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size)
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y16_y16_from_y12i_cuda(dest, count, reinterpret_cast<const rscuda::y12i_pixel*>(source));
}
#endif
#ifndef RS2_USE_CUDA
split_frame(dest, count, reinterpret_cast<const y12i_pixel*>(source),
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
[](const y12i_pixel & p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
[](const y12i_pixel & p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
[](const y12i_pixel& p) -> uint16_t { return p.l() << 6 | p.l() >> 4; }, // We want to convert 10-bit data to 16-bit data
[](const y12i_pixel& p) -> uint16_t { return p.r() << 6 | p.r() >> 4; }); // Multiply by 64 1/16 to efficiently approximate 65535/1023
#endif
}

y12i_to_y16y16::y12i_to_y16y16(int left_idx, int right_idx)
: y12i_to_y16y16("Y12I to Y16L Y16R Transform", left_idx, right_idx) {}

y12i_to_y16y16::y12i_to_y16y16(const char * name, int left_idx, int right_idx)
y12i_to_y16y16::y12i_to_y16y16(const char* name, int left_idx, int right_idx)
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
: interleaved_functional_processing_block(name, RS2_FORMAT_Y12I, RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 1,
RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 2)
RS2_FORMAT_Y16, RS2_STREAM_INFRARED, RS2_EXTENSION_VIDEO_FRAME, 2)
{}

void y12i_to_y16y16::process_function( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size, int input_size)
void y12i_to_y16y16::process_function(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size, int input_size)
{
unpack_y16_y16_from_y12i_10(dest, source, width, height, actual_size);
}
Expand Down
11 changes: 8 additions & 3 deletions src/proc/y8i-to-y8y8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#include "rsutils/rsutilgpu.h"
#endif

namespace librealsense
{
struct y8i_pixel { uint8_t l, r; };
void unpack_y8_y8_from_y8i( uint8_t * const dest[], const uint8_t * source, int width, int height, int actual_size)
void unpack_y8_y8_from_y8i(uint8_t* const dest[], const uint8_t* source, int width, int height, int actual_size)
{
auto count = width * height;
#ifdef RS2_USE_CUDA
rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel *>(source));
#else
if (rsutils::rs2_is_gpu_available())
{
rscuda::split_frame_y8_y8_from_y8i_cuda(dest, count, reinterpret_cast<const y8i_pixel*>(source));
}
#endif
#ifndef RS2_USE_CUDA
split_frame(dest, count, reinterpret_cast<const y8i_pixel*>(source),
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
[](const y8i_pixel & p) -> uint8_t { return p.l; },
[](const y8i_pixel & p) -> uint8_t { return p.r; });
Expand Down
18 changes: 18 additions & 0 deletions third-party/rsutils/include/rsutils/rsutilgpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2023 Intel Corporation. All Rights Reserved.
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved

#pragma once

#ifdef __cplusplus
extern "C" {
Nir-Az marked this conversation as resolved.
Show resolved Hide resolved
#endif

namespace rsutils {

bool rs2_is_gpu_available();

} // namespace rsutils

#ifdef __cplusplus
}
#endif
Loading
Loading