COMPMID-873: Integrate RSH NEON Depthwise Convolution routine

Change-Id: Ida1e9a836bc518bfe5563e16bf7f92bde5fc13f7 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118472 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Pablo Tello <pablo.tello@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2018-01-30 18:13:46 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:46:07 +0000
commit: 4074c995d2a88684fd4a9d1aa36d51de56bb8dab (patch)
tree: 280a15ca10ff88c5eb432be011ccb721660a3349 /src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
parent: c5694afca3f937f8c9b3ec328da9394f11f9af2d (diff)
download: ComputeLibrary-4074c995d2a88684fd4a9d1aa36d51de56bb8dab.tar.gz
1 files changed, 70 insertions, 12 deletions
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 1f3e5d1192..d35e3e6026 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -33,9 +33,11 @@
 
 using namespace arm_compute;
 using namespace arm_compute::misc;
+using namespace arm_compute::misc::shape_calculator;
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false)
+    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
+      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
 {
 }
 
@@ -48,20 +50,49 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
 
     _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
     _has_bias     = biases != nullptr;
+    _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
+                                                                                          conv_info,
+                                                                                          input->info()->data_type());
+    _are_weights_reshaped = false;
 
-    // Allocate the intermediate accumulator tensor in case of fixed point input
-    if(_is_quantized)
+    if(_is_optimized)
     {
-        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
-        _accumulator.info()->set_quantization_info(input->info()->quantization_info());
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+
+        // Configure optimized depthwise
+        _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _input_nhwc.allocator()->allocate();
+        _weights_hwio.allocator()->allocate();
+        _output_nhwc.allocator()->allocate();
+
+        // Create convolver (deferred)
+        _dwc_kernel.generate_convolver();
     }
+    else
+    {
+        // Allocate the intermediate accumulator tensor in case of fixed point input
+        if(_is_quantized)
+        {
+            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+            _accumulator.info()->set_quantization_info(input->info()->quantization_info());
+            zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        }
 
-    // Configure depthwise convolution kernel
-    _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
 
-    // Configure border handler
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    }
 
     // Configure biases accumulation
     if(_has_bias || _is_quantized)
@@ -83,8 +114,35 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
 
 void NEDepthwiseConvolutionLayer3x3::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    // Permute weights in HWIO format if the optimized kernel will be executedd
+    if(!_are_weights_reshaped && _is_optimized)
+    {
+        _are_weights_reshaped = true;
+        _permute_weights.run();
+    }
+
+    // Handle input
+    if(_is_optimized)
+    {
+        // Permute input to NHWC format execution
+        _permute_input.run();
+    }
+    else
+    {
+        // Fill border in NCHW format execution
+        NEScheduler::get().schedule(&_border_handler, Window::DimX);
+    }
+
+    // Execute depthwise convolution
+    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+
+    // Permute output to ACL's native NCHW format in case of NHWC execution
+    if(_is_optimized)
+    {
+        _permute_output.run();
+    }
+
+    // Add biases
     if(_has_bias || _is_quantized)
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
author	Georgios Pinitas <georgios.pinitas@arm.com>	2018-01-30 18:13:46 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:46:07 +0000
commit	4074c995d2a88684fd4a9d1aa36d51de56bb8dab (patch)
tree	280a15ca10ff88c5eb432be011ccb721660a3349 /src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
parent	c5694afca3f937f8c9b3ec328da9394f11f9af2d (diff)
download	ComputeLibrary-4074c995d2a88684fd4a9d1aa36d51de56bb8dab.tar.gz