COMPMID-1975: Update depthwise convolution.

Change-Id: Iad58672be35710a7ec2e918653d6d529709387e8 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-on: https://review.mlplatform.org/c/898 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2019-03-11 14:03:23 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2019-03-29 09:54:53 +0000
commit: 47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree: 87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
parent: 2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
download: ComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz
1 files changed, 199 insertions, 115 deletions
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index f0fd4cf256..5db94a67c0 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -31,112 +31,78 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+
 using namespace arm_compute::misc;
 using namespace arm_compute::misc::shape_calculator;
 
-NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
-      _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
-      _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
+      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
+      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor                   *input,
+                                                       const ITensor             *weights,
+                                                       const ITensor             *biases,
+                                                       ITensor                   *output,
+                                                       const PadStrideInfo       &conv_info,
+                                                       unsigned int               depth_multiplier,
+                                                       const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_UNUSED(act_info);
 
     PixelValue zero_value(0.f);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _has_bias     = biases != nullptr;
-    _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
-                                                                                          conv_info,
-                                                                                          input->info()->data_type(),
-                                                                                          depth_multiplier,
-                                                                                          input->info()->data_layout());
-    _are_weights_reshaped = false;
-    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
-    _permute              = _is_optimized == _is_nchw;
-
     // Initialize the intermediate accumulator tensor in case of quantized input
     if(_is_quantized)
     {
         TensorShape accum_shape  = output->info()->tensor_shape();
         DataLayout  accum_layout = output->info()->data_layout();
-        if(!_is_optimized && !_is_nchw)
+        if(!_is_nchw)
         {
             permute(accum_shape, PermutationVector(1U, 2U, 0U));
             accum_layout = DataLayout::NCHW;
         }
 
+        _memory_group.manage(&_accumulator);
         _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
         _accumulator.info()->set_data_layout(accum_layout);
         zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
 
-    if(_is_optimized)
+    if(!_is_nchw)
     {
-        ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
-        if(_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NCHW -> NHWC
-            _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
-            _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
 
-            // Configure the function to transform the weights tensor from IHW -> HWI
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
+        // Configure the function to transform the weights tensor from HWI -> IHW
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
 
-            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permuted_output.info()->set_data_layout(DataLayout::NHWC);
-            _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
+        // Configure optimized depthwise
+        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
 
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-            _permuted_output.allocator()->allocate();
-        }
-        else
-        {
-            _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
-        }
+        // Configure border handler
+        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
     }
     else
     {
-        if(!_is_nchw)
-        {
-            // Configure the function to transform the input tensor from NHWC -> NCHW
-            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-            // Configure the function to transform the weights tensor from HWI -> IHW
-            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
-            // Configure optimized depthwise
-            _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
 
-            // Allocate tensors
-            _permuted_input.allocator()->allocate();
-            _permuted_weights.allocator()->allocate();
-        }
-        else
-        {
-            // Configure depthwise convolution kernel
-            _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
-
-            // Configure border handler
-            _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
-        }
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
     }
 
     // Configure biases accumulation
@@ -147,32 +113,116 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
         _accumulator.allocator()->allocate();
     }
     else if(_has_bias)
     {
-        _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
     }
 
-    if(!_is_optimized && !_is_nchw)
+    // Permute output
+    if(!_is_nchw)
     {
         // Configure the function to transform the convoluted output to NHWC
         _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
         _permuted_output.allocator()->allocate();
     }
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor             *input,
+                                                         const ITensor             *weights,
+                                                         const ITensor             *biases,
+                                                         ITensor                   *output,
+                                                         const PadStrideInfo       &conv_info,
+                                                         unsigned int               depth_multiplier,
+                                                         const ActivationLayerInfo &act_info)
+{
+    ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
+    const bool          is_relu         = arm_compute::utils::info_helpers::is_relu(act_info);
+    const bool          is_relu6        = arm_compute::utils::info_helpers::is_relu6(act_info);
+    _is_activationlayer_enabled         = act_info.enabled() && !(is_relu || is_relu6);
+    if(!_is_activationlayer_enabled)
+    {
+        act_info_to_use = act_info;
+    }
+
+    if(_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
+
+        // Configure optimized depthwise
+        _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
+
+        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+        _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+        _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
+    }
+}
 
-    //Configure Activation Layer
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor       *input,
+                                               const ITensor *weights,
+                                               const ITensor *biases,
+                                               ITensor *output, const PadStrideInfo &conv_info,
+                                               unsigned int               depth_multiplier,
+                                               const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _has_bias         = biases != nullptr;
+    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
+                                                                                       weights->info(),
+                                                                                       conv_info,
+                                                                                       depth_multiplier);
+    _is_nchw                    = input->info()->data_layout() == DataLayout::NCHW;
+    _permute                    = _is_optimized == _is_nchw;
+    _is_prepared                = false;
     _is_activationlayer_enabled = act_info.enabled();
 
+    // Configure appropriate pipeline
+    if(_is_optimized)
+    {
+        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    }
+    else
+    {
+        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+    }
+
+    // Configure activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
     }
 }
 
-Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo         *input,
+                                                const ITensorInfo         *weights,
+                                                const ITensorInfo         *biases,
+                                                const ITensorInfo         *output,
+                                                const PadStrideInfo       &conv_info,
+                                                unsigned int               depth_multiplier,
+                                                const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
@@ -184,14 +234,20 @@ Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
-    TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier))
+    {
+        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
 
-    if(is_quantized)
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        }
+    }
+    else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
     }
 
     //Validate Activation Layer
@@ -203,59 +259,86 @@ Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const
     return Status{};
 }
 
-void NEDepthwiseConvolutionLayer3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run_generic()
 {
-    if(_is_first_run && _is_optimized)
-    {
-        _is_first_run = false;
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
-    }
+    // Fill border
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
 
-    // Permute weights
-    if(_permute)
-    {
-        if(!_are_weights_reshaped)
-        {
-            _are_weights_reshaped = true;
-            _permute_weights.run();
-        }
+    // Execute depthwise convolution
+    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
-        _permute_input.run();
+    // Add biases
+    if(_has_bias || _is_quantized)
+    {
+        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
     }
 
-    // Handle input
-    if(!_is_optimized)
+    // Permute output
+    if(!_is_nchw)
     {
-        // Fill border
-        NEScheduler::get().schedule(&_border_handler, Window::DimX);
+        _permute_output.run();
     }
+}
 
-    // Execute depthwise convolution
-    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+void NEDepthwiseConvolutionLayer3x3::run_optimized()
+{
+    // Run assembly function
+    _dwc_optimized_func.run();
 
     // Permute output
-    if(_is_optimized && _is_nchw)
+    if(_is_nchw)
     {
         _permute_output.run();
     }
+}
 
-    // Add biases
-    if(_has_bias || _is_quantized)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
-    }
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+    prepare();
 
-    // Permute output
-    if(!_is_optimized && !_is_nchw)
+    _memory_group.acquire();
+
+    // Permute input
+    if(_permute)
     {
-        _permute_output.run();
+        _permute_input.run();
     }
 
+    _is_optimized ? run_optimized() : run_generic();
+
+    // Run activation
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.run();
     }
+
+    _memory_group.release();
+}
+
+void NEDepthwiseConvolutionLayer3x3::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Permute weights
+        if(_permute)
+        {
+            _permuted_weights.allocator()->allocate();
+            _permute_weights.run();
+            _original_weights->mark_as_unused();
+        }
+
+        // Prepare optimized function
+        if(_is_optimized)
+        {
+            _dwc_optimized_func.prepare();
+            if(!_permuted_weights.is_used())
+            {
+                _permuted_weights.allocator()->free();
+            }
+        }
+
+        _is_prepared = true;
+    }
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
@@ -542,3 +625,4 @@ void NEDepthwiseConvolutionLayer::prepare()
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
author	Georgios Pinitas <georgios.pinitas@arm.com>	2019-03-11 14:03:23 +0000
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2019-03-29 09:54:53 +0000
commit	47d39dc615d1dee2482bc84699802165a9778ac8 (patch)
tree	87f2fdb4f4957be7ff1c043be6328e4154cdf9e1 /src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
parent	2d2551ed3934f071eb6a65f5b776301454bc147a (diff)
download	ComputeLibrary-47d39dc615d1dee2482bc84699802165a9778ac8.tar.gz