Disabled conversion to float of model's input #25555

Merged · 4 commits · May 16, 2024
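In substance, this PR stops the DNN input DataLayer from force-converting every model input to CV_32F: non-float inputs (CV_8S, CV_8U, CV_32S, CV_64S) now keep their element type end to end, mean/scale preprocessing is accepted only for float32 inputs, and the CUDA backend gains the 8-bit kernel instantiations and wrappers this requires. A minimal usage sketch of the resulting behavior (hypothetical model file name; assumes an ONNX model that declares an 8-bit input):

```cpp
#include <opencv2/dnn.hpp>
#include <vector>

int main()
{
    // Hypothetical model; assumes its input tensor is declared as uint8.
    cv::dnn::Net net = cv::dnn::readNetFromONNX("model_with_uint8_input.onnx");

    // A 1x3x224x224 uint8 blob. With this PR it reaches the network as CV_8U
    // instead of being converted to CV_32F by the input DataLayer.
    cv::Mat blob(std::vector<int>{1, 3, 224, 224}, CV_8U, cv::Scalar(0));
    net.setInput(blob);  // scalefactor/mean must stay at their defaults here

    cv::Mat out = net.forward();
    return 0;
}
```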
4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/concat.cu
@@ -152,6 +152,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
 #endif
     template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
+    template void concat<int8_t>(const Stream&, TensorSpan<int8_t>, std::size_t, TensorView<int8_t>, std::size_t);
+    template void concat<uint8_t>(const Stream&, TensorSpan<uint8_t>, std::size_t, TensorView<uint8_t>, std::size_t);
     template void concat<int32_t>(const Stream&, TensorSpan<int32_t>, std::size_t, TensorView<int32_t>, std::size_t);
     template void concat<int64_t>(const Stream&, TensorSpan<int64_t>, std::size_t, TensorView<int64_t>, std::size_t);

@@ -277,6 +279,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
 #endif
     template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+    template void concat_with_offsets(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
+    template void concat_with_offsets(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
     template void concat_with_offsets(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
     template void concat_with_offsets(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);
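A note on the pattern repeated in this and the following .cu files: these are explicit template instantiations. The kernel templates are compiled by nvcc inside the .cu translation unit, so every element type the C++ host code may request has to be instantiated there; otherwise callers in .cpp files fail at link time. A minimal illustration of the mechanism (hypothetical names, not OpenCV source):

```cpp
// kernels.cu -- compiled by nvcc
#include <cstddef>
#include <cstdint>

template <class T>
__global__ void scale_kernel(T* data, std::size_t n, T factor)
{
    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

template <class T>
void scale(T* data, std::size_t n, T factor)
{
    scale_kernel<T><<<static_cast<unsigned>((n + 255) / 256), 256>>>(data, n, factor);
}

// Without these lines, host code in a .cpp file calling scale<int8_t>(...)
// would get an "undefined reference" error at link time.
template void scale<float>(float*, std::size_t, float);
template void scale<std::int8_t>(std::int8_t*, std::size_t, std::int8_t);
template void scale<std::uint8_t>(std::uint8_t*, std::size_t, std::uint8_t);
```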
20 changes: 20 additions & 0 deletions modules/dnn/src/cuda/eltwise_ops.cu
@@ -371,6 +371,26 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y)
     template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);

+    template void eltwise_mod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_fmod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_div_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_prod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_sum_coeff_2(const Stream&, TensorSpan<int8_t>, int8_t, TensorView<int8_t>, int8_t, TensorView<int8_t>);
+    template void eltwise_sum_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_max_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+    template void eltwise_min_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
+
+    template void eltwise_mod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_fmod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_div_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_prod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_sum_coeff_2(const Stream&, TensorSpan<uint8_t>, uint8_t, TensorView<uint8_t>, uint8_t, TensorView<uint8_t>);
+    template void eltwise_sum_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_max_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+    template void eltwise_min_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
+
     template void eltwise_mod_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
     template void eltwise_fmod_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
     template void eltwise_sub_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/fill_copy.cu
@@ -67,6 +67,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void fill(const Stream&, Span<__half>, __half);
 #endif
     template void fill(const Stream&, Span<float>, float);
+    template void fill(const Stream&, Span<int8_t>, int8_t);
+    template void fill(const Stream&, Span<uint8_t>, uint8_t);
     template void fill(const Stream&, Span<int>, int);
     template void fill(const Stream&, Span<int64_t>, int64_t);

@@ -95,6 +97,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void copy(const Stream&, Span<__half>, View<__half>);
 #endif
     template void copy(const Stream&, Span<float>, View<float>);
+    template void copy(const Stream&, Span<int8_t>, View<int8_t>);
+    template void copy(const Stream&, Span<uint8_t>, View<uint8_t>);
     template void copy(const Stream&, Span<int32_t>, View<int32_t>);
     template void copy(const Stream&, Span<int64_t>, View<int64_t>);
14 changes: 14 additions & 0 deletions modules/dnn/src/cuda/limits.hpp
@@ -31,6 +31,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
     __device__ static float lowest() { return -FLT_MAX; }
 };

+template <>
+struct numeric_limits<signed char> {
+    __device__ static signed char min() { return 1; }
+    __device__ static signed char max() { return SCHAR_MAX; }
+    __device__ static signed char lowest() { return SCHAR_MIN; }
+};
+
+template <>
+struct numeric_limits<unsigned char> {
+    __device__ static unsigned char min() { return 1; }
+    __device__ static unsigned char max() { return UCHAR_MAX; }
+    __device__ static unsigned char lowest() { return 0; }
+};
+
 template <>
 struct numeric_limits<int32_t> {
     __device__ static int32_t min() { return 1; }
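These device-side numeric_limits specializations follow the file's existing convention, visible in the float and int32_t specializations around them: min() is the smallest positive value, max() the largest representable one, and lowest() the most negative, mirroring std::numeric_limits. Kernels typically seed max-style reductions with lowest(). A hedged illustration (simplified, not OpenCV source):

```cpp
// Illustrative CUDA kernel showing the intended use of lowest():
// seed a running maximum so that any real element replaces it.
template <class T>
__global__ void block_max(const T* in, int n, T* out)
{
    T best = numeric_limits<T>::lowest();  // SCHAR_MIN for int8, 0 for uint8
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        best = in[i] > best ? in[i] : best;
    // ... a block-wide reduction of `best` would follow here
    if (threadIdx.x == 0)
        *out = best;
}
```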
40 changes: 40 additions & 0 deletions modules/dnn/src/cuda/max_unpooling.cu
@@ -257,6 +257,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
         const std::vector<std::size_t>&);

+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<int8_t>, TensorSpan<int32_t>, TensorView<int8_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<int8_t>, TensorSpan<int64_t>, TensorView<int8_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<uint8_t>, TensorSpan<int32_t>, TensorView<uint8_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<uint8_t>, TensorSpan<int64_t>, TensorView<uint8_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
     template void max_pooling_with_indices(const Stream&,
         TensorSpan<int32_t>, TensorSpan<int32_t>, TensorView<int32_t>,
         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
@@ -365,6 +385,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
         const std::vector<std::size_t>&);

+    template void max_unpooling(const Stream&,
+        TensorSpan<int8_t>, TensorView<int8_t>, TensorView<int32_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_unpooling(const Stream&,
+        TensorSpan<int8_t>, TensorView<int8_t>, TensorView<int64_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_unpooling(const Stream&,
+        TensorSpan<uint8_t>, TensorView<uint8_t>, TensorView<int32_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_unpooling(const Stream&,
+        TensorSpan<uint8_t>, TensorView<uint8_t>, TensorView<int64_t>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
     template void max_unpooling(const Stream&,
         TensorSpan<int32_t>, TensorView<int32_t>, TensorView<int32_t>,
         const std::vector<std::size_t>&, const std::vector<std::size_t>&,
2 changes: 2 additions & 0 deletions modules/dnn/src/cuda/padding.cu
@@ -197,6 +197,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
 #endif
     template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+    template void copy_with_reflection101(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+    template void copy_with_reflection101(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
     template void copy_with_reflection101(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
     template void copy_with_reflection101(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/permute.cu
@@ -107,6 +107,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

     template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
     template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
+    template void transpose(const Stream&, Span<int8_t>, View<int8_t>, std::size_t, std::size_t);
+    template void transpose(const Stream&, Span<uint8_t>, View<uint8_t>, std::size_t, std::size_t);
     template void transpose(const Stream&, Span<int32_t>, View<int32_t>, std::size_t, std::size_t);
     template void transpose(const Stream&, Span<int64_t>, View<int64_t>, std::size_t, std::size_t);

@@ -286,6 +288,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
 #endif
     template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+    template void permute(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
+    template void permute(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
     template void permute(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
     template void permute(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);
2 changes: 2 additions & 0 deletions modules/dnn/src/cuda/slice.cu
@@ -199,6 +199,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
 #endif
     template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+    template void slice(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
+    template void slice(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
     template void slice(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
     template void slice(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);
10 changes: 6 additions & 4 deletions modules/dnn/src/layer_internals.hpp
@@ -154,9 +154,10 @@ struct DataLayer : public Layer
         for (int i = 0; i < inputsData.size(); ++i)
         {
             bool isFP16 = outputs[i].depth() == CV_16F;
-            if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) {
+            if (inputsData[i].type() != CV_32F)
+            {
                 CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), "");
-                CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0);
+                CV_CheckTrue(means[i] == Scalar() && scaleFactors[i] == 1.0, "Input mean and scale are supported only for float32 input");
                 inputsData[i].copyTo(outputs[i]);
                 continue;
             }
@@ -221,9 +222,10 @@ struct DataLayer : public Layer
         for (int i = 0; i < inputsData.size(); ++i)
         {
             bool isFP16 = outputs[i].depth() == CV_16F;
-            if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) {
+            if (inputsData[i].type() != CV_32F)
+            {
                 CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), "");
-                CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0);
+                CV_CheckTrue(means[i] == Scalar() && scaleFactors[i] == 1.0, "Input mean and scale are supported only for float32 input");
                 inputsData[i].copyTo(outputs[i]);
                 continue;
             }
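The practical consequence of the DataLayer change: a non-CV_32F input is copied through unchanged, and non-default mean/scale preprocessing on such an input now fails with an explicit check instead of being silently applied after a float conversion. A sketch (assumes `net` is a loaded cv::dnn::Net):

```cpp
cv::Mat blob(std::vector<int>{1, 3, 8, 8}, CV_8S, cv::Scalar(0));
net.setInput(blob);  // OK: the CV_8S type is preserved through the input layer

// This would now trigger the CV_CheckTrue above with the message
// "Input mean and scale are supported only for float32 input":
// net.setInput(blob, "", 1.0 / 255.0, cv::Scalar(127));
```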
4 changes: 1 addition & 3 deletions modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -359,9 +359,7 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         for (auto input : inputs)
         {
             CV_CheckTypeEQ(inputs[0], input, "All inputs should have equal types");
-            if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA)
-                CV_CheckType(input, input == CV_32F || input == CV_32S || input == CV_64S, "Unsupported type");
-            else if (preferableTarget == DNN_TARGET_OPENCL_FP16)
+            if (preferableTarget == DNN_TARGET_OPENCL_FP16)
                 CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, "");
             else
                 CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, "");
6 changes: 5 additions & 1 deletion modules/dnn/src/legacy_backend.cpp
@@ -90,7 +90,7 @@ Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
     CV_Assert(haveCUDA());

 #ifdef HAVE_CUDA
-    CV_CheckType(m.depth(), m.depth() == CV_32F || m.depth() == CV_32S || m.depth() == CV_64S, "Unsupported type for CUDA");
+    CV_CheckType(m.depth(), m.depth() == CV_32F || m.depth() == CV_8S || m.depth() == CV_8U || m.depth() == CV_32S || m.depth() == CV_64S, "Unsupported type for CUDA");
     CV_Assert(IS_DNN_CUDA_TARGET(targetId));
     switch (m.depth())
     {
@@ -99,6 +99,10 @@ Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
             return CUDABackendWrapperFP16::create(m);
         else
             return CUDABackendWrapperFP32::create(m);
+    case CV_8S:
+        return CUDABackendWrapperINT8::create(m);
+    case CV_8U:
+        return CUDABackendWrapperUINT8::create(m);
     case CV_32S:
         return CUDABackendWrapperINT32::create(m);
     case CV_64S:
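CUDABackendWrapperINT8 and CUDABackendWrapperUINT8 are presumably defined alongside the existing FP16/FP32/INT32/INT64 aliases as instantiations of the generic CUDA wrapper, roughly as follows (an assumed sketch; the defining diff is not shown on this page):

```cpp
// Assumed aliases, by analogy with CUDABackendWrapperFP32 etc.:
using CUDABackendWrapperINT8  = GenericCUDABackendWrapper<int8_t,  CV_8S>;
using CUDABackendWrapperUINT8 = GenericCUDABackendWrapper<uint8_t, CV_8U>;
```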
5 changes: 1 addition & 4 deletions modules/dnn/src/net_impl.cpp
@@ -552,7 +552,7 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
         Mat& inp = layers[0].outputBlobs[i];
         CV_Assert(inp.total());
         int type = inp.type();
-        if (type != CV_32S && type != CV_64S)
+        if (type == CV_32F)
         {
             type = CV_32F;
             if (preferableBackend == DNN_BACKEND_OPENCV &&
@@ -562,9 +562,6 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
                 if (layers[0].dtype == CV_32F)
                     layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F);
             }
-            if (netWasQuantized && inp.type() == CV_8S) {
-                type = CV_8S;
-            }
         }
         inputShapes.push_back(shape(inp));
         inputTypes.push_back(type);
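Combined with the DataLayer change, the new condition means an input blob's type is preserved verbatim unless it is CV_32F, which may still be narrowed to CV_16F for FP16 targets; the removed netWasQuantized carve-out is subsumed because CV_8S now passes through like any other non-float type. A condensed sketch of the resulting selection (simplified, not verbatim source):

```cpp
// Simplified distillation of the input-type logic after this change.
int pickInputType(int inputType, bool fp16Target, int firstLayerDtype)
{
    if (inputType == CV_32F && fp16Target && firstLayerDtype == CV_32F)
        return CV_16F;  // float inputs may still be narrowed for FP16 targets
    return inputType;   // CV_8S/CV_8U/CV_32S/CV_64S pass through unchanged
}
```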
6 changes: 5 additions & 1 deletion modules/dnn/src/net_impl_backend.cpp
@@ -62,7 +62,7 @@ Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
     {
         CV_Assert(haveCUDA());
 #ifdef HAVE_CUDA
-        CV_CheckType(host.depth(), host.depth() == CV_32F || host.depth() == CV_32S || host.depth() == CV_64S, "Unsupported type for CUDA");
+        CV_CheckType(host.depth(), host.depth() == CV_32F || host.depth() == CV_8S || host.depth() == CV_8U || host.depth() == CV_32S || host.depth() == CV_64S, "Unsupported type for CUDA");
         CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
         switch (host.depth())
         {
@@ -71,6 +71,10 @@ Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
                 return CUDABackendWrapperFP16::create(baseBuffer, shape);
             else
                 return CUDABackendWrapperFP32::create(baseBuffer, shape);
+        case CV_8S:
+            return CUDABackendWrapperINT8::create(baseBuffer, shape);
+        case CV_8U:
+            return CUDABackendWrapperUINT8::create(baseBuffer, shape);
         case CV_32S:
             return CUDABackendWrapperINT32::create(baseBuffer, shape);
         case CV_64S:
30 changes: 23 additions & 7 deletions modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -1704,7 +1704,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
 }

-Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
+Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto, bool uint8ToInt8)
 {
     if (tensor_proto.raw_data().empty() && tensor_proto.float_data().empty() &&
         tensor_proto.double_data().empty() && tensor_proto.int64_data().empty() &&
@@ -1834,22 +1834,38 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto, bool uint8ToInt8)
             Mat(sizes, CV_64SC1, (void*)src).copyTo(blob);
         }
     }
-    else if (datatype == opencv_onnx::TensorProto_DataType_INT8 ||
-             datatype == opencv_onnx::TensorProto_DataType_UINT8)
+    else if (datatype == opencv_onnx::TensorProto_DataType_INT8)
     {
+        if (!tensor_proto.int32_data().empty())
+        {
+            const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
+            Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S);
+        }
+        else
+        {
+            char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
+            Mat(sizes, CV_8S, val).copyTo(blob);
+        }
+    }
+    else if (datatype == opencv_onnx::TensorProto_DataType_UINT8)
+    {
         // TODO : Add support for uint8 weights and acitvations. For now, converting uint8 tensors to int8.
-        int offset = datatype == opencv_onnx::TensorProto_DataType_INT8 ? 0 : -128;
-        int depth = datatype == opencv_onnx::TensorProto_DataType_INT8 ? CV_8S : CV_8U;

         if (!tensor_proto.int32_data().empty())
         {
             const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
-            Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S, 1.0, offset);
+            if (uint8ToInt8)
+                Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S, 1, -128); // handle as ONNX quantized weight
+            else
+                Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8U);
         }
         else
         {
             char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
-            Mat(sizes, depth, val).convertTo(blob, CV_8S, 1.0, offset);
+            if (uint8ToInt8)
+                Mat(sizes, CV_8U, val).convertTo(blob, CV_8S, 1, -128); // handle as ONNX quantized weight
+            else
+                Mat(sizes, CV_8U, val).copyTo(blob);
         }
     }
     else
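When uint8ToInt8 is true, getMatFromTensor keeps the historical behavior for quantized ONNX weights: unsigned bytes are shifted by -128 into the signed range, so 0 maps to -128, 128 to 0, and 255 to 127. A standalone sketch of that mapping:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat u8 = (cv::Mat_<uchar>(1, 3) << 0, 128, 255);
    cv::Mat s8;
    u8.convertTo(s8, CV_8S, 1, -128);  // the same shift applied for uint8ToInt8
    std::cout << s8 << std::endl;      // prints [-128, 0, 127]
    return 0;
}
```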