Merge pull request #25555 from alexlyulkov:al/int8-uint8-dnn-input
Disabled conversion of the model's input to float #25555

In dnn 4.x, any model input is usually converted to float32 or float16 (except for quantized models), and mean and scale can also be applied. The current dnn 5.x performs the same conversion for everything except int32 and int64 types. I removed this conversion; the explicit equivalent is sketched below.
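
For context, the float conversion that 4.x performed implicitly can still be done explicitly by the caller. A hedged sketch using the public `cv::dnn::blobFromImage` helper; the file name and preprocessing constants are illustrative, not from this patch:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

// Explicit old-style preprocessing: convert to float32, apply scale and mean.
cv::Mat img = cv::imread("example.jpg");  // hypothetical input image
cv::Mat blob = cv::dnn::blobFromImage(
    img, /*scalefactor=*/1 / 255.0, /*size=*/cv::Size(224, 224),
    /*mean=*/cv::Scalar(104, 117, 123), /*swapRB=*/true, /*crop=*/false);
// blob is a CV_32F NCHW tensor; pass it to net.setInput(blob) as before.
```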

Here is how the pipeline works now (see the sketch after this list):
- if the input Mat type is float32, the pipeline applies mean and scale and may convert it to float16;
- if the input Mat type is not float32, the pipeline preserves the input type and does not apply mean and scale.
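
A minimal sketch of the new behavior (not taken from the PR itself), assuming `net` comes from `cv::dnn::readNet()` on a model whose input accepts 8-bit tensors:

```cpp
#include <opencv2/dnn.hpp>

// Hypothetical helper illustrating the new input handling.
void runWithUint8Input(cv::dnn::Net& net, const cv::Mat& inputU8 /* CV_8U blob */)
{
    net.setInput(inputU8);        // type preserved: no float conversion, no mean/scale
    cv::Mat out = net.forward();  // stays on the integer path end to end

    // Mean and scale are honored only for float32 inputs:
    cv::Mat inputF32;
    inputU8.convertTo(inputF32, CV_32F);
    net.setInput(inputF32, /*name=*/"", /*scalefactor=*/1 / 255.0, /*mean=*/cv::Scalar(127));
    cv::Mat outF = net.forward();
}
```

Note that, per the DataLayer change below, passing a non-float32 input together with a non-default mean or scale now fails a `CV_CheckTrue` assertion instead of silently converting.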

There was a conflict in the protobuf parser between the ONNX importer and the tests. The ONNX importer treated every uint8 weight as a quantized weight and applied the conversion x = int8(x_uint8 - 128) inside the protobuf parser. The ONNX conformance tests used the same protobuf reader, so tests with uint8 inputs couldn't read the input values properly. I've made this conversion optional; a sketch of the remapping follows.
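
A minimal standalone sketch of that remapping (not part of the patch), using the same `cv::Mat::convertTo` call as the parser change further down; the values are chosen for illustration:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Quantized-weight convention: x_int8 = int8(x_uint8 - 128).
    cv::Mat u8 = (cv::Mat_<uint8_t>(1, 4) << 0, 127, 128, 255);
    cv::Mat s8;
    u8.convertTo(s8, CV_8S, 1, -128);  // saturating shift -> {-128, -1, 0, 127}
    std::cout << s8 << std::endl;      // applied only when uint8ToInt8 is true
    return 0;
}
```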

These ONNX conformance tests are enabled:
- test_add_uint8
- test_div_uint8
- test_mul_uint8
- test_sub_uint8
- test_max_int8
- test_max_uint8
- test_min_int8
- test_min_uint8
- test_mod_mixed_sign_int8
- test_mod_uint8

These tests were removed:
- Test_two_inputs.basic (when input is uint8)
- setInput.normalization (when input is uint8)

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests, and test data in the opencv_extra repository, if applicable.
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
alexlyulkov committed May 16, 2024
1 parent 6af0394 commit 9238eb2
Showing 22 changed files with 206 additions and 182 deletions.
4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/concat.cu
@@ -152,6 +152,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
#endif
template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);
template void concat<int8_t>(const Stream&, TensorSpan<int8_t>, std::size_t, TensorView<int8_t>, std::size_t);
template void concat<uint8_t>(const Stream&, TensorSpan<uint8_t>, std::size_t, TensorView<uint8_t>, std::size_t);
template void concat<int32_t>(const Stream&, TensorSpan<int32_t>, std::size_t, TensorView<int32_t>, std::size_t);
template void concat<int64_t>(const Stream&, TensorSpan<int64_t>, std::size_t, TensorView<int64_t>, std::size_t);

@@ -277,6 +279,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
template void concat_with_offsets(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);

20 changes: 20 additions & 0 deletions modules/dnn/src/cuda/eltwise_ops.cu
@@ -371,6 +371,26 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
template void eltwise_max_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);

template void eltwise_mod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_sub_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_div_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<int8_t>, int8_t, TensorView<int8_t>, int8_t, TensorView<int8_t>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<int8_t> output, TensorView<int8_t> x, TensorView<int8_t> y);

template void eltwise_mod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_sub_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_div_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_sum_coeff_2(const Stream&, TensorSpan<uint8_t>, uint8_t, TensorView<uint8_t>, uint8_t, TensorView<uint8_t>);
template void eltwise_sum_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_max_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<uint8_t> output, TensorView<uint8_t> x, TensorView<uint8_t> y);

template void eltwise_mod_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
template void eltwise_sub_2(const Stream& stream, TensorSpan<int32_t> output, TensorView<int32_t> x, TensorView<int32_t> y);
4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/fill_copy.cu
@@ -67,6 +67,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void fill(const Stream&, Span<__half>, __half);
#endif
template void fill(const Stream&, Span<float>, float);
template void fill(const Stream&, Span<int8_t>, int8_t);
template void fill(const Stream&, Span<uint8_t>, uint8_t);
template void fill(const Stream&, Span<int>, int);
template void fill(const Stream&, Span<int64_t>, int64_t);

@@ -95,6 +97,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void copy(const Stream&, Span<__half>, View<__half>);
#endif
template void copy(const Stream&, Span<float>, View<float>);
template void copy(const Stream&, Span<int8_t>, View<int8_t>);
template void copy(const Stream&, Span<uint8_t>, View<uint8_t>);
template void copy(const Stream&, Span<int32_t>, View<int32_t>);
template void copy(const Stream&, Span<int64_t>, View<int64_t>);

14 changes: 14 additions & 0 deletions modules/dnn/src/cuda/limits.hpp
@@ -31,6 +31,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de
__device__ static float lowest() { return -FLT_MAX; }
};

template <>
struct numeric_limits<signed char> {
__device__ static signed char min() { return 1; }
__device__ static signed char max() { return SCHAR_MAX; }
__device__ static signed char lowest() { return SCHAR_MIN; }
};

template <>
struct numeric_limits<unsigned char> {
__device__ static unsigned char min() { return 1; }
__device__ static unsigned char max() { return UCHAR_MAX; }
__device__ static unsigned char lowest() { return 0; }
};

template <>
struct numeric_limits<int32_t> {
__device__ static int32_t min() { return 1; }
40 changes: 40 additions & 0 deletions modules/dnn/src/cuda/max_unpooling.cu
@@ -257,6 +257,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_pooling_with_indices(const Stream&,
TensorSpan<int8_t>, TensorSpan<int32_t>, TensorView<int8_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_pooling_with_indices(const Stream&,
TensorSpan<int8_t>, TensorSpan<int64_t>, TensorView<int8_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_pooling_with_indices(const Stream&,
TensorSpan<uint8_t>, TensorSpan<int32_t>, TensorView<uint8_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_pooling_with_indices(const Stream&,
TensorSpan<uint8_t>, TensorSpan<int64_t>, TensorView<uint8_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_pooling_with_indices(const Stream&,
TensorSpan<int32_t>, TensorSpan<int32_t>, TensorView<int32_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
@@ -365,6 +385,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_unpooling(const Stream&,
TensorSpan<int8_t>, TensorView<int8_t>, TensorView<int32_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_unpooling(const Stream&,
TensorSpan<int8_t>, TensorView<int8_t>, TensorView<int64_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_unpooling(const Stream&,
TensorSpan<uint8_t>, TensorView<uint8_t>, TensorView<int32_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_unpooling(const Stream&,
TensorSpan<uint8_t>, TensorView<uint8_t>, TensorView<int64_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
const std::vector<std::size_t>&);

template void max_unpooling(const Stream&,
TensorSpan<int32_t>, TensorView<int32_t>, TensorView<int32_t>,
const std::vector<std::size_t>&, const std::vector<std::size_t>&,
2 changes: 2 additions & 0 deletions modules/dnn/src/cuda/padding.cu
@@ -197,6 +197,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
#endif
template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
template void copy_with_reflection101(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
template void copy_with_reflection101(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
template void copy_with_reflection101(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
template void copy_with_reflection101(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::pair<std::size_t, std::size_t>> ranges);

4 changes: 4 additions & 0 deletions modules/dnn/src/cuda/permute.cu
@@ -107,6 +107,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<float>, View<float>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<int8_t>, View<int8_t>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<uint8_t>, View<uint8_t>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<int32_t>, View<int32_t>, std::size_t, std::size_t);
template void transpose(const Stream&, Span<int64_t>, View<int64_t>, std::size_t, std::size_t);

@@ -286,6 +288,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
template void permute(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
template void permute(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
template void permute(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
template void permute(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);

2 changes: 2 additions & 0 deletions modules/dnn/src/cuda/slice.cu
@@ -199,6 +199,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
#endif
template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
template void slice(const Stream&, TensorSpan<int8_t>, TensorView<int8_t>, std::vector<std::size_t>);
template void slice(const Stream&, TensorSpan<uint8_t>, TensorView<uint8_t>, std::vector<std::size_t>);
template void slice(const Stream&, TensorSpan<int32_t>, TensorView<int32_t>, std::vector<std::size_t>);
template void slice(const Stream&, TensorSpan<int64_t>, TensorView<int64_t>, std::vector<std::size_t>);

10 changes: 6 additions & 4 deletions modules/dnn/src/layer_internals.hpp
@@ -154,9 +154,10 @@ struct DataLayer : public Layer
for (int i = 0; i < inputsData.size(); ++i)
{
bool isFP16 = outputs[i].depth() == CV_16F;
if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) {
if (inputsData[i].type() != CV_32F)
{
CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), "");
CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0);
CV_CheckTrue(means[i] == Scalar() && scaleFactors[i] == 1.0, "Input mean and scale are supported only for float32 input");
inputsData[i].copyTo(outputs[i]);
continue;
}
@@ -221,9 +222,10 @@ struct DataLayer : public Layer
for (int i = 0; i < inputsData.size(); ++i)
{
bool isFP16 = outputs[i].depth() == CV_16F;
if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) {
if (inputsData[i].type() != CV_32F)
{
CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), "");
CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0);
CV_CheckTrue(means[i] == Scalar() && scaleFactors[i] == 1.0, "Input mean and scale are supported only for float32 input");
inputsData[i].copyTo(outputs[i]);
continue;
}
4 changes: 1 addition & 3 deletions modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -359,9 +359,7 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
for (auto input : inputs)
{
CV_CheckTypeEQ(inputs[0], input, "All inputs should have equal types");
if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA)
CV_CheckType(input, input == CV_32F || input == CV_32S || input == CV_64S, "Unsupported type");
else if (preferableTarget == DNN_TARGET_OPENCL_FP16)
if (preferableTarget == DNN_TARGET_OPENCL_FP16)
CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, "");
else
CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, "");
6 changes: 5 additions & 1 deletion modules/dnn/src/legacy_backend.cpp
@@ -90,7 +90,7 @@ Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
CV_Assert(haveCUDA());

#ifdef HAVE_CUDA
CV_CheckType(m.depth(), m.depth() == CV_32F || m.depth() == CV_32S || m.depth() == CV_64S, "Unsupported type for CUDA");
CV_CheckType(m.depth(), m.depth() == CV_32F || m.depth() == CV_8S || m.depth() == CV_8U || m.depth() == CV_32S || m.depth() == CV_64S, "Unsupported type for CUDA");
CV_Assert(IS_DNN_CUDA_TARGET(targetId));
switch (m.depth())
{
@@ -99,6 +99,10 @@ Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
return CUDABackendWrapperFP16::create(m);
else
return CUDABackendWrapperFP32::create(m);
case CV_8S:
return CUDABackendWrapperINT8::create(m);
case CV_8U:
return CUDABackendWrapperUINT8::create(m);
case CV_32S:
return CUDABackendWrapperINT32::create(m);
case CV_64S:
5 changes: 1 addition & 4 deletions modules/dnn/src/net_impl.cpp
@@ -552,7 +552,7 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
Mat& inp = layers[0].outputBlobs[i];
CV_Assert(inp.total());
int type = inp.type();
if (type != CV_32S && type != CV_64S)
if (type == CV_32F)
{
type = CV_32F;
if (preferableBackend == DNN_BACKEND_OPENCV &&
@@ -562,9 +562,6 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
if (layers[0].dtype == CV_32F)
layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F);
}
if (netWasQuantized && inp.type() == CV_8S) {
type = CV_8S;
}
}
inputShapes.push_back(shape(inp));
inputTypes.push_back(type);
6 changes: 5 additions & 1 deletion modules/dnn/src/net_impl_backend.cpp
@@ -62,7 +62,7 @@ Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
{
CV_Assert(haveCUDA());
#ifdef HAVE_CUDA
CV_CheckType(host.depth(), host.depth() == CV_32F || host.depth() == CV_32S || host.depth() == CV_64S, "Unsupported type for CUDA");
CV_CheckType(host.depth(), host.depth() == CV_32F || host.depth() == CV_8S || host.depth() == CV_8U || host.depth() == CV_32S || host.depth() == CV_64S, "Unsupported type for CUDA");
CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
switch (host.depth())
{
@@ -71,6 +71,10 @@ Ptr<BackendWrapper> Net::Impl::wrap(Mat& host)
return CUDABackendWrapperFP16::create(baseBuffer, shape);
else
return CUDABackendWrapperFP32::create(baseBuffer, shape);
case CV_8S:
return CUDABackendWrapperINT8::create(baseBuffer, shape);
case CV_8U:
return CUDABackendWrapperUINT8::create(baseBuffer, shape);
case CV_32S:
return CUDABackendWrapperINT32::create(baseBuffer, shape);
case CV_64S:
30 changes: 23 additions & 7 deletions modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -1704,7 +1704,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
}

Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto, bool uint8ToInt8)
{
if (tensor_proto.raw_data().empty() && tensor_proto.float_data().empty() &&
tensor_proto.double_data().empty() && tensor_proto.int64_data().empty() &&
@@ -1834,22 +1834,38 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto, bool uint8ToInt8)
Mat(sizes, CV_64SC1, (void*)src).copyTo(blob);
}
}
else if (datatype == opencv_onnx::TensorProto_DataType_INT8 ||
datatype == opencv_onnx::TensorProto_DataType_UINT8)
else if (datatype == opencv_onnx::TensorProto_DataType_INT8)
{
if (!tensor_proto.int32_data().empty())
{
const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S);
}
else
{
char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
Mat(sizes, CV_8S, val).copyTo(blob);
}
}
else if (datatype == opencv_onnx::TensorProto_DataType_UINT8)
{
// TODO : Add support for uint8 weights and activations. For now, converting uint8 tensors to int8.
int offset = datatype == opencv_onnx::TensorProto_DataType_INT8 ? 0 : -128;
int depth = datatype == opencv_onnx::TensorProto_DataType_INT8 ? CV_8S : CV_8U;

if (!tensor_proto.int32_data().empty())
{
const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S, 1.0, offset);
if (uint8ToInt8)
Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8S, 1, -128); // handle as ONNX quantized weight
else
Mat(sizes, CV_32SC1, (void*)field.data()).convertTo(blob, CV_8U);
}
else
{
char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
Mat(sizes, depth, val).convertTo(blob, CV_8S, 1.0, offset);
if (uint8ToInt8)
Mat(sizes, CV_8U, val).convertTo(blob, CV_8S, 1, -128); // handle as ONNX quantized weight
else
Mat(sizes, CV_8U, val).copyTo(blob);
}
}
else
