From 13ec5f0a09e038f12cbe0f3b119a215934b72b42 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Thu, 2 Jan 2020 12:11:13 +0000
Subject: COMPMID-2800: Add support for QASYMM8_SIGNED in
 NEDepthwiseConvolutionLayer3x3Kernel

Change-Id: Ia5d23ff2c9e59c80ded2fac5ca02704214f0a01a
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2537
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp        | 21 +++++++++++++--------
 .../NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 10 +++++-----
 src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp | 20 ++++++++++----------
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp  |  2 +-
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp      | 14 +++++++++-----
 5 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'src')
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index e47786525e..1dd05d2cf1 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,7 @@ public:
         const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
         const int          output_w        = output->info()->dimension(0);
         const int          output_h        = output->info()->dimension(1);
-        const int          delta_input     = detail::get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          delta_input     = detail::get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
         const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
         const unsigned int conv_pad_x      = conv_info.pad_left();
         const unsigned int conv_pad_y      = conv_info.pad_top();
@@ -107,8 +107,8 @@ public:
             {
                 auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
                 auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + dilation.y()) * input_stride_y);
-                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); //uint8
-                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);                           //int32
+                auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); // uint8/int8
+                auto p_out  = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);                           // int32
 
                 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                     in_top += delta_input, in_mid += delta_input, in_low += delta_input,
@@ -116,12 +116,12 @@ public:
                 {
                     if(dilation == Size2D(1U, 1U))
                     {
-                        auto vres = detail::convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
+                        auto vres = detail::convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, stridex, input_offset);
                         detail::store_results<stridex>(p_out, vres);
                     }
                     else
                     {
-                        auto vres = detail::convolve_3x3_dilation<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), input_offset);
+                        auto vres = detail::convolve_3x3_dilation(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), stridex, input_offset);
                         detail::store_results<stridex>(p_out, vres);
                     }
                 }
@@ -156,7 +156,7 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     const DataLayout   data_layout = input->data_layout();
@@ -192,7 +192,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 
     // Get convolved dimensions
     const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-    const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+    const DataType    output_dt    = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt).set_quantization_info(output->quantization_info()));
@@ -209,6 +209,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     switch(input->data_type())
     {
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
             num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -263,6 +264,7 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
     switch(input->info()->data_type())
     {
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::F32:
             _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
             break;
@@ -307,6 +309,9 @@ void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const Threa
         case DataType::QASYMM8:
             convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
             break;
+        case DataType::QASYMM8_SIGNED:
+            convolve_3x3<int8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
+            break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index bcf70b3ad8..4a71c1edea 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -651,7 +651,7 @@ public:
         const int          output_w        = output->info()->dimension(0);
         const int          output_h        = output->info()->dimension(1);
         const int          num_planes_z    = window.z().end() - window.z().start();
-        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          delta_input     = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
         const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
         const unsigned int conv_pad_left   = conv_info.pad_left();
@@ -718,7 +718,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
+                            auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex);
                             store_results<stridex>(p_out, vres);
                         }
                     }
@@ -743,7 +743,7 @@ public:
                         for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                             in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                         {
-                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
+                            auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, stridex);
                             accumulate_results<stridex>(p_out, vres);
                         }
                     }
@@ -774,7 +774,7 @@ public:
         const int          output_w        = output->info()->dimension(0);
         const int          output_h        = output->info()->dimension(1);
         const int          num_planes_z    = window.z().end() - window.z().start();
-        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const int          delta_input     = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
         const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
         const unsigned int conv_pad_left   = conv_info.pad_left();
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 72632492d7..374005d897 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -196,7 +196,7 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w
 
             auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
 
-            wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row));
+            wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
         },
         in, out);
     }
@@ -352,10 +352,10 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
 
             auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
 
-            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
-            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
-            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
-            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
+            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
         },
         in, out);
     }
@@ -467,10 +467,10 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const
 
             auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
 
-            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
-            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
-            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
-            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
+            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
         },
         inb, out);
     }
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 0320002fba..beb024c529 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -39,7 +39,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo
                                     unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     if(!is_data_type_quantized_per_channel(weights->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 142f873ef4..e0094f4eec 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -469,8 +469,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
     }
 
     // Check data type
-    const DataType data_type          = weights->data_type();
-    bool           is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type) || data_type == DataType::QSYMM8_PER_CHANNEL;
+    // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer
+    const DataType input_type            = input->data_type();
+    const bool     is_input_type_valid   = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
+    const DataType weights_type          = weights->data_type();
+    const bool     is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
+                                           || weights_type == DataType::QSYMM8_PER_CHANNEL;
 
     // Check weighs size
     std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
@@ -496,12 +500,12 @@ bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITenso
     // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
     bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || ((dilation.x() == dilation.y()) && strides.first == 1));
 
-    if(data_type == DataType::QSYMM8_PER_CHANNEL)
+    if(weights_type == DataType::QSYMM8_PER_CHANNEL)
     {
         is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U));
     }
 
-    return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
+    return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported;
 }
 
 void NEDepthwiseConvolutionAssemblyDispatch::run()
-- 
cgit v1.2.1