From a799ce0ad775829862891dd98d1232638ec8761e Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 12 Sep 2018 20:11:34 +0100
Subject: COMPMID-1564: Add NEDepthwiseConvolution3x3 for QASYMM8

Change-Id: I1f55508af6f220e5f41df7b56daffb4761ed0591
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148253
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
---
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp       |  37 ++-
 .../NEDirectConvolutionLayerOutputStageKernel.cpp  | 104 ++++++-
 .../depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp  |   2 +-
 .../depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp  |   2 +-
 .../depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp  |   2 +-
 .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp  |   2 +-
 .../depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp     | 128 +++++++++
 .../depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp     | 166 +++++++++++
 .../convolution/depthwise/impl_fp16_fp16.hpp       |   8 +-
 .../convolution/depthwise/impl_fp32_fp32.hpp       |   8 +-
 .../kernels/convolution/depthwise/impl_u8_s32.hpp  | 315 +++++++++++++++++++++
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  40 +--
 12 files changed, 774 insertions(+), 40 deletions(-)
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp

(limited to 'src')

diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 7029b06615..99bdb7a70e 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -198,8 +198,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
             output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
             output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
 
+            const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
             // Output auto inizialitation if not yet initialized
-            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
 
             // Configure window (optimised)
             // Set padding in channels
@@ -324,7 +326,7 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
     }
 
     // Check supported data type
-    bool supported_datatype = is_data_type_float(dt);
+    bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
 
     // Check for supported strides
     const auto &strides           = conv_info.stride();
@@ -345,11 +347,15 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
 
 void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
 
     _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
+    if(_convolver)
+    {
+        _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
+    }
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -433,6 +439,31 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3
     const auto stride_x = conv_info.stride().first;
     switch(dt)
     {
+        case DataType::QASYMM8:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+                               in->ptr_to_element(Coordinates()),
+                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+                               in->ptr_to_element(Coordinates()),
+                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 864c63f731..a571d54501 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -194,8 +194,8 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -304,14 +304,14 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
                 internal_vst1q(out_ptr, internal_vld1q(in_ptr));
             }
         },
-        in, bi);
+        in, bi, out);
     }
 }
 
 // QASYMM8 specializations
 template <>
-void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                                                 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                      int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
     uint8x16_t      min                           = vdupq_n_u8(0);
@@ -352,8 +352,8 @@ void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *
     in, out);
 }
 template <>
-void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                                                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_UNUSED(bias);
 
@@ -382,6 +382,85 @@ void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor
     },
     in, out);
 }
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                      int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Window window_bias = window;
+    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    window_bias.set(3, Window::Dimension(0, 0, 0));
+
+    Iterator in(input, window);
+    Iterator bi(bias, window_bias);
+
+    Iterator out(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr   = reinterpret_cast<int32_t *>(in.ptr());
+        const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+
+        // Accumulate bias
+        int32x4x4_t v_in =
+        {
+            {
+                vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
+                vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
+                vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
+                vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
+            }
+        };
+
+        const auto out_ptr = out.ptr();
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, bi, out);
+}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    ARM_COMPUTE_UNUSED(bias);
+
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Window window_bias = window;
+    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    window_bias.set(3, Window::Dimension(0, 0, 0));
+
+    Iterator in(input, window);
+    Iterator bi(bias, window_bias);
+
+    Iterator out(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+
+        // Accumulate bias
+        int32x4x4_t v_in =
+        {
+            {
+                vld1q_s32(in_ptr),
+                vld1q_s32(in_ptr + 4),
+                vld1q_s32(in_ptr + 8),
+                vld1q_s32(in_ptr + 12)
+            }
+        };
+
+        const auto out_ptr = out.ptr();
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, bi, out);
+}
 } // namespace
 
 NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
@@ -426,19 +505,19 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
         {
             case DataType::S32:
             {
-                _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+                _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
                 break;
             }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
-                _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+                _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
-                _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+                _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
                 break;
             }
             default:
@@ -451,6 +530,11 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
     {
         switch(input->info()->data_type())
         {
+            case DataType::S32:
+            {
+                _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+                break;
+            }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index c5a056560b..ca1de26ed7 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 0c96bebc02..21e8f04774 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 941c8e9248..c7113d05b3 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index 1cbd6d5623..c36c24ec0f 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
new file mode 100644
index 0000000000..8f22a64ea6
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
new file mode 100644
index 0000000000..cf515504c7
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index ed4cfb86b9..dacfb24c89 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, Kern
     const int in_pad_bottom=0,
     const int in_pad_right=0,
     const int out_pad_bottom=0,
-    const int out_pad_right=0
+    const int out_pad_right=0,
+    const int input_offset=0,
+    const int weights_offset=0
   );
 };
 
@@ -102,7 +104,9 @@ void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::p
   const int _in_pad_bottom,
   const int _in_pad_right,
   const int _out_pad_bottom,
-  const int _out_pad_right
+  const int _out_pad_right,
+  const int _input_offset,
+  const int _weights_offset
 )
 {
   constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 7a216ed518..840086f917 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, Kern
     const int in_pad_bottom=0,
     const int in_pad_right=0,
     const int out_pad_bottom=0,
-    const int out_pad_right=0
+    const int out_pad_right=0,
+    const int input_offset=0,
+    const int weights_offset=0
   );
 };
 
@@ -102,7 +104,9 @@ void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_t
   const int _in_pad_bottom,
   const int _in_pad_right,
   const int _out_pad_bottom,
-  const int _out_pad_right
+  const int _out_pad_right,
+  const int _input_offset,
+  const int _weights_offset
 )
 {
   constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
new file mode 100644
index 0000000000..d0d8de538d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ *          NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for U8 to S32
+template <int OutputTileRows, int OutputTileCols,
+        int KernelRows, int KernelCols,
+        int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
+{
+    typedef DepthwiseConvolution<
+            OutputTileRows, OutputTileCols,
+            KernelRows, KernelCols,
+            StrideRows, StrideCols,
+            uint8_t, int32_t
+    > DWC;
+
+    template <
+            bool Specialize=false,  // Specialize (or not) the method
+            int InPadTop=0,         // If specialized, top padding
+            int InPadLeft=0,        // If specialized, left padding
+            int InPadBottom=0,      // If specialized, bottom padding
+            int InPadRight=0,       // If specialized, right padding
+            int OutPadBottom=0,     // If specialized, bottom output padding
+            int OutPadRight=0       // If specialized, bottom right padding
+    >
+    static void process_tile(
+            const int n_channels,
+            const uint8_t* const weights,
+            const int weight_row_stride,
+            const int weight_col_stride,
+            const uint8_t* const inptr,
+            const int in_row_stride,
+            const int in_col_stride,
+            int32_t* const outptr,
+            const int out_row_stride,
+            const int out_col_stride,
+            const int in_pad_top=0,
+            const int in_pad_left=0,
+            const int in_pad_bottom=0,
+            const int in_pad_right=0,
+            const int out_pad_bottom=0,
+            const int out_pad_right=0,
+            const int input_offset=0,
+            const int weights_offset=0);
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+        bool Specialize,
+        int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+        int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
+        const int n_channels,
+        const uint8_t *__restrict__ const weights,
+        const int weight_row_stride,
+        const int weight_col_stride,
+        const uint8_t *__restrict__ const inptr,
+        const int in_row_stride,
+        const int in_col_stride,
+        int32_t *__restrict__ const outptr,
+        const int out_row_stride,
+        const int out_col_stride,
+        const int _in_pad_top,
+        const int _in_pad_left,
+        const int _in_pad_bottom,
+        const int _in_pad_right,
+        const int _out_pad_bottom,
+        const int _out_pad_right,
+        const int _input_offset,
+        const int _weights_offset
+)
+{
+    constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+    constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+    constexpr auto kernel_rows = DWC::kernel_rows;
+    constexpr auto kernel_cols = DWC::kernel_cols;
+    constexpr auto output_tile_rows = DWC::output_tile_rows;
+    constexpr auto output_tile_cols = DWC::output_tile_cols;
+    constexpr auto stride_rows = DWC::stride_rows;
+    constexpr auto stride_cols = DWC::stride_cols;
+
+    // Extract parameters
+    const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+    const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+    const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+    const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+    const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+    const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+    // Compute valid ranges of the tile
+    const int in_cells_i = inner_tile_rows - in_pad_bottom;
+    const int in_cells_j = inner_tile_cols - in_pad_right;
+    const int out_cells_i = output_tile_rows - out_pad_bottom;
+    const int out_cells_j = output_tile_cols - out_pad_right;
+
+    // Instantiate pointers
+    const uint8_t* __restrict__ inptr_base = inptr;
+    const uint8_t* __restrict__ wptr_base = weights;
+    int32_t* __restrict__ outptr_base = outptr;
+
+    // Perform the depthwise convolution
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
+    const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
+    for (; channels_remaining >= 16; channels_remaining -= 16)
+    {
+        // Load input tile
+        int32x4x4_t u[inner_tile_rows][inner_tile_cols];
+        for (int i = 0; i < inner_tile_rows; i++)
+        {
+            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+            for (int j = 0; j < inner_tile_cols; j++)
+            {
+                if (i < in_pad_top || in_cells_i <= i ||
+                    j < in_pad_left || in_cells_j <= j)
+                {
+                    u[i][j].val[0] = vdupq_n_s32(0);
+                    u[i][j].val[1] = vdupq_n_s32(0);
+                    u[i][j].val[2] = vdupq_n_s32(0);
+                    u[i][j].val[3] = vdupq_n_s32(0);
+                }
+                else
+                {
+                    const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
+                    u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
+                    u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
+                    u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
+                    u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
+                }
+            }
+        }
+        inptr_base += 16;
+
+        // Load weights tile
+        int32x4x4_t w[kernel_rows][kernel_cols];
+        for (int i = 0; i < kernel_rows; i++)
+        {
+            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+            for (int j = 0; j < kernel_cols; j++)
+            {
+                const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
+                w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
+                w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
+                w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
+                w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
+            }
+        }
+        wptr_base += 16;
+
+        // Perform the convolution
+        int32x4x4_t v[output_tile_rows][output_tile_cols];
+        for (int out_i = 0; out_i < out_cells_i; out_i++)
+        {
+            for (int out_j = 0; out_j < out_cells_j; out_j++)
+            {
+                // Base co-ordinate
+                const int base_i = out_i * stride_rows;
+                const int base_j = out_j * stride_cols;
+
+                // Fill the accumulator
+                for (int in_i = 0; in_i < kernel_rows; in_i++)
+                {
+                    const int i = base_i + in_i;
+                    for (int in_j = 0; in_j < kernel_cols; in_j++)
+                    {
+                        const int j = base_j + in_j;
+                        if (in_i == 0 && in_j == 0)
+                        {
+                            // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+                            v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
+                            v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
+                            v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
+                            v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
+                        }
+                        else
+                        {
+                            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+                            v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
+                            v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
+                            v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
+                            v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Store the output tile
+        for (int i = 0; i < out_cells_i; i++)
+        {
+            int32_t* const outptr_row = outptr_base + i*out_row_stride;
+            for (int j = 0; j < out_cells_j; j++)
+            {
+                vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
+                vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
+                vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
+                vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
+            }
+        }
+        outptr_base += 16;
+    }
+#endif  // __aarch64__
+    for (; channels_remaining; channels_remaining--)
+    {
+        // Load input tile
+        int32_t u[inner_tile_rows][inner_tile_cols];
+        for (int i = 0; i < inner_tile_rows; i++)
+        {
+            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+            for (int j = 0; j < inner_tile_cols; j++)
+            {
+                if (i < in_pad_top || in_cells_i <= i ||
+                    j < in_pad_left || in_cells_j <= j)
+                {
+                    u[i][j] = static_cast<uint8_t>(0);
+                }
+                else
+                {
+                    u[i][j] = static_cast<int32_t >(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
+                }
+            }
+        }
+        inptr_base++;
+
+        // Load weights tile
+        int32_t w[kernel_rows][kernel_cols];
+        for (int i = 0; i < kernel_rows; i++)
+        {
+            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+            for (int j = 0; j < kernel_cols; j++)
+            {
+                w[i][j] = static_cast<int32_t >(*(wptr_row + j*weight_col_stride)) + _weights_offset;
+            }
+        }
+        wptr_base++;
+
+        // Perform the convolution
+        int32_t v[output_tile_rows][output_tile_cols];
+        for (int out_i = 0; out_i < out_cells_i; out_i++)
+        {
+            for (int out_j = 0; out_j < out_cells_j; out_j++)
+            {
+                // Clear the accumulator
+                v[out_i][out_j] = static_cast<int32_t>(0);
+
+                // Base co-ordinate
+                const int base_i = out_i * stride_rows;
+                const int base_j = out_j * stride_cols;
+
+                // Fill the accumulator
+                for (int in_i = 0; in_i < kernel_rows; in_i++)
+                {
+                    const int i = base_i + in_i;
+                    for (int in_j = 0; in_j < kernel_cols; in_j++)
+                    {
+                        const int j = base_j + in_j;
+                        v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+                    }
+                }
+            }
+        }
+
+        // Store the output tile
+        for (int i = 0; i < out_cells_i; i++)
+        {
+            int32_t* const outptr_row = outptr_base + i*out_row_stride;
+            for (int j = 0; j < out_cells_j; j++)
+            {
+                *(outptr_row + j*out_col_stride) = v[i][j];
+            }
+        }
+        outptr_base++;
+    }
+}
+
+}  // namespace depthwise
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a46be2ec92..9dcbc99332 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -59,8 +59,25 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
     _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
     _permute              = _is_optimized == _is_nchw;
 
+    // Initialize the intermediate accumulator tensor in case of quantized input
+    if(_is_quantized)
+    {
+        TensorShape accum_shape  = output->info()->tensor_shape();
+        DataLayout  accum_layout = output->info()->data_layout();
+        if(!_is_optimized && !_is_nchw)
+        {
+            permute(accum_shape, PermutationVector(1U, 2U, 0U));
+            accum_layout = DataLayout::NCHW;
+        }
+
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+        _accumulator.info()->set_data_layout(accum_layout);
+        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+    }
+
     if(_is_optimized)
     {
+        ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
         if(_is_nchw)
         {
             // Configure the function to transform the input tensor from NCHW -> NHWC
@@ -75,8 +92,8 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
             _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
 
             // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-            _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+            _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+            _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
 
             // Allocate tensors
             _permuted_input.allocator()->allocate();
@@ -85,26 +102,11 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
         }
         else
         {
-            _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+            _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
         }
     }
     else
     {
-        // Allocate the intermediate accumulator tensor in case of quantized input
-        if(_is_quantized)
-        {
-            TensorShape accum_shape = output->info()->tensor_shape();
-
-            if(!_is_nchw)
-            {
-                permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            }
-
-            _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
-            _accumulator.info()->set_quantization_info(input->info()->quantization_info());
-            zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
-        }
-
         if(!_is_nchw)
         {
             // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -143,7 +145,7 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
         _accumulator.allocator()->allocate();
     }
     else if(_has_bias)
-- 
cgit v1.2.1