From 747ef82c88f9afe14a8b80b6b3b34118353e97f2 Mon Sep 17 00:00:00 2001
From: Matteo Martincigh <matteo.martincigh@arm.com>
Date: Tue, 18 Dec 2018 09:26:39 +0000
Subject: MLCE-77 Depthwise Convolution with depth multiplier > 1 doesn't work

 * Unified ArmNN's weight format to [ M, I, H, W ] for the depthwise convolution
 * Added conversion utilities to permute/reshape the weights as appropriate
   when using CL and Neon backends
 * Updated the reference implementation of the convolution
 * Updated the relevant unit tests accordingly

!android-nn-driver:459

Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18
---
 .../workloads/NeonDepthwiseConvolutionWorkload.cpp | 72 +++++++++++++---------
 1 file changed, 44 insertions(+), 28 deletions(-)

(limited to 'src/backends/neon')
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
index 6cad12cba8..be26359662 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
@@ -8,10 +8,7 @@
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <neon/NeonLayerSupport.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
-
-#include <DataLayoutIndexed.hpp>
-
-using namespace armnnUtils;
+#include <backendsCommon/WorkloadUtils.hpp>
 
 namespace armnn
 {
@@ -19,17 +16,23 @@ namespace armnn
 using namespace armcomputetensorutils;
 
 arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
-    const TensorInfo& output,
-    const DepthwiseConvolution2dDescriptor& descriptor,
-    const TensorInfo& weights,
-    const Optional<TensorInfo>& biases)
+                                                             const TensorInfo& output,
+                                                             const DepthwiseConvolution2dDescriptor& descriptor,
+                                                             const TensorInfo& weights,
+                                                             const Optional<TensorInfo>& biases)
 {
-    const arm_compute::TensorInfo aclInputInfo =
-        BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclOutputInfo =
-        BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclWeightsInfo =
-        BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+    // ArmNN's weight format is [ M, I, H, W ], so the depth multiplier M is the size of dimension 0
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout);
+
+    // Convert the weights into the compute library format
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
     arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
@@ -42,9 +45,7 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i
         optionalAclBiasesInfo = &aclBiasesInfo;
     }
 
-    const arm_compute::PadStrideInfo aclPadStrideInfo =
-        BuildArmComputePadStrideInfo(descriptor);
-    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+    const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
 
     return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -59,14 +60,21 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     const WorkloadInfo& info)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
-    const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+    // ArmNN's weight format is [ M, I, H, W ]
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
-    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
+    // Allocate a buffer for the swizzling of the weight tensor
+    std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]);
 
-    INeonTensorHandle* inputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
-    INeonTensorHandle* outputTensorHandle =  static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
-    DataLayoutIndexed dataLayoutIndex(m_Data.m_Parameters.m_DataLayout);
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight,
+                                                                   m_Data.m_Parameters.m_DataLayout,
+                                                                   permuteBuffer.get());
+
+    // Convert the weights into the compute library format
+    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
@@ -84,6 +92,9 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
 
     m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionWorkload", 1, 1);
 
+    INeonTensorHandle* inputTensorHandle  = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
+    INeonTensorHandle* outputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
+
     arm_compute::ITensor& input  = inputTensorHandle->GetTensor();
     arm_compute::ITensor& output = outputTensorHandle->GetTensor();
 
@@ -91,9 +102,11 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    bool use3x3Optimisation = weightInfo.GetShape()[dataLayoutIndex.GetWidthIndex()] == 3 &&
-                              weightInfo.GetShape()[dataLayoutIndex.GetHeightIndex()] == 3;
+    // Get the depth multiplier (M, i.e. dimension 0 of ArmNN's [ M, I, H, W ] weight format)
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
+    // Use the optimised 3x3 layer when the kernel height (dim 2) and width (dim 3) of [ M, I, H, W ] are both 3
+    bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3);
     if (use3x3Optimisation)
     {
         m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
@@ -102,7 +115,8 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
     else
     {
@@ -112,12 +126,14 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
 
     BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
 
-    InitializeArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight);
+    ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted);
+    InitializeArmComputeTensorData(*m_KernelTensor, &weightsPermutedHandle);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
-- 
cgit v1.2.1