MLCE-77 Depthwise Convolution with depth multiplier > 1 doesn't work

* Unified ArmNN's weight format to [ M, I, H, W ] for the depthwise convolution
* Added conversion utilities to permute/reshape the weights as appropriate when using CL and Neon backends
* Updated the reference implementation of the convolution
* Updated the relevant unit tests accordingly

!android-nn-driver:459

Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18
author: Matteo Martincigh <matteo.martincigh@arm.com> 2018-12-18 09:26:39 +0000
committer: Matteo Martincigh <matteo.martincigh@arm.com> 2019-01-04 17:28:07 +0000
commit: 747ef82c88f9afe14a8b80b6b3b34118353e97f2 (patch)
tree: a29ac33b84fb96a41103a0a97327189495374cc9 /src/backends
parent: 760892724d131c7da4b9baad05cddd49276ad6bb (diff)
download: armnn-747ef82c88f9afe14a8b80b6b3b34118353e97f2.tar.gz
18 files changed, 332 insertions, 209 deletions
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
index a2d7d8c797..32af42f7e1 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
@@ -109,19 +109,6 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso
     return arm_compute::TensorInfo(aclTensorShape, 1, aclDataType, aclQuantizationInfo);
 }
 
-arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout)
-{
-    switch(dataLayout)
-    {
-        case armnn::DataLayout::NHWC : return arm_compute::DataLayout::NHWC;
-
-        case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW;
-
-        default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" +
-                                                std::to_string(static_cast<int>(dataLayout)) + "]");
-    }
-}
-
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo,
                                                   armnn::DataLayout dataLayout)
 {
@@ -136,6 +123,19 @@ arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tenso
     return clTensorInfo;
 }
 
+arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout)
+{
+    switch(dataLayout) // Maps each armnn layout to the arm_compute layout of the same name
+    {
+        case armnn::DataLayout::NHWC : return arm_compute::DataLayout::NHWC;
+
+        case armnn::DataLayout::NCHW : return arm_compute::DataLayout::NCHW;
+
+        default: throw InvalidArgumentException("Unknown armnn::DataLayout: [" + // Any other value is rejected
+                                                std::to_string(static_cast<int>(dataLayout)) + "]");
+    }
+}
+
 arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor)
 {
     using arm_compute::PoolingType;
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.hpp b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
index fbd850c687..fa455b746b 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
@@ -36,16 +36,16 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te
 /// armnn::ITensorInfo.
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo);
 
-/// Utility function used to convert armnn::DataLayout to arm_compute::DataLayout
-/// armnn::DataLayout.
-arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout);
-
 /// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given
 /// armnn::ITensorInfo.
 /// armnn::DataLayout.
 arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo,
                                                   armnn::DataLayout dataLayout);
 
+/// Utility function used to convert an armnn::DataLayout value to the corresponding
+/// arm_compute::DataLayout value.
+arm_compute::DataLayout ConvertDataLayout(armnn::DataLayout dataLayout);
+
 /// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor.
 arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor);
 
diff --git a/src/backends/backendsCommon/CMakeLists.txt b/src/backends/backendsCommon/CMakeLists.txt
index f29563093c..b120f51184 100644
--- a/src/backends/backendsCommon/CMakeLists.txt
+++ b/src/backends/backendsCommon/CMakeLists.txt
@@ -27,6 +27,7 @@ list(APPEND armnnBackendsCommon_sources
     WorkloadFactory.hpp
     Workload.hpp
     WorkloadInfo.hpp
+    WorkloadUtils.cpp
     WorkloadUtils.hpp
 )
 
diff --git a/src/backends/backendsCommon/CpuTensorHandle.cpp b/src/backends/backendsCommon/CpuTensorHandle.cpp
index fe0c634e7c..9dcd3f38df 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.cpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.cpp
@@ -18,7 +18,7 @@ ConstCpuTensorHandle::ConstCpuTensorHandle(const TensorInfo& tensorInfo)
 }
 
 template <>
-const void* ConstCpuTensorHandle::GetConstTensor() const
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const
 {
     return m_Memory;
 }
@@ -30,7 +30,7 @@ CpuTensorHandle::CpuTensorHandle(const TensorInfo& tensorInfo)
 }
 
 template <>
-void* CpuTensorHandle::GetTensor() const
+void* CpuTensorHandle::GetTensor<void>() const
 {
     return m_MutableMemory;
 }
diff --git a/src/backends/backendsCommon/CpuTensorHandle.hpp b/src/backends/backendsCommon/CpuTensorHandle.hpp
index ae13d6c439..b88a0d385b 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.hpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.hpp
@@ -72,6 +72,9 @@ private:
     const void* m_Memory;
 };
 
+template<>
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const;
+
 // Abstract specialization of ConstCpuTensorHandle that allows write access to the same data.
 class CpuTensorHandle : public ConstCpuTensorHandle
 {
@@ -99,6 +102,9 @@ private:
     void* m_MutableMemory;
 };
 
+template <>
+void* CpuTensorHandle::GetTensor<void>() const;
+
 // A CpuTensorHandle that owns the wrapped memory region.
 class ScopedCpuTensorHandle : public CpuTensorHandle
 {
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 8847b4efbf..1dac498c11 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -593,9 +593,10 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
 
     const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3;
 
-    //inputChannels * channelMultiplier should be equal to outputChannels.
+    // Expected weight shape: [ M, I, H, W ] - This shape does NOT depend on the data layout
+    // inputChannels * channelMultiplier should be equal to outputChannels.
     const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0];
-    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[channelIndex];
+    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1];
     const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[channelIndex];
     if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels)
     {
diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp
new file mode 100644
index 0000000000..fa387a7a0b
--- /dev/null
+++ b/src/backends/backendsCommon/WorkloadUtils.cpp
@@ -0,0 +1,111 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer) // Destination storage supplied by the caller
+{
+    BOOST_ASSERT_MSG(tensor, "Invalid input tensor"); // Both pointers are mandatory
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+
+    TensorInfo tensorInfo = tensor->GetTensorInfo(); // Local copy, replaced below if a permutation is applied
+
+    if (permutationVector.GetSize() > 0) // Non-empty vector: permute both the tensor info and the data
+    {
+        tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector); // Now describes the permuted tensor
+        armnnUtils::Permute(tensorInfo.GetShape(), permutationVector,
+                            tensor->GetConstTensor<void>(), permuteBuffer,
+                            GetDataTypeSize(tensorInfo.GetDataType()));
+    }
+    else // Empty vector: no permutation requested
+    {
+        ::memcpy(permuteBuffer, tensor->GetConstTensor<void>(), tensorInfo.GetNumBytes()); // Plain byte copy
+    }
+
+    return ConstTensor(tensorInfo, permuteBuffer); // Built on permuteBuffer, which now holds the output data
+}
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Rewrites the shape stored in weightInfo in place; only SetShape() is called, so no tensor data is moved here
+    const TensorShape& weightShape = weightInfo.GetShape();
+    switch (dataLayout)
+    {
+        case DataLayout::NHWC:
+            // The data layout is NHWC, reshape from [ H, W, I, M ] to [ 1, H, W, I * M ]
+            weightInfo.SetShape({ 1,
+                                  weightShape[0],
+                                  weightShape[1],
+                                  weightShape[2] * weightShape[3] });
+            break;
+        case DataLayout::NCHW:
+        default: // Any layout other than NHWC takes the NCHW path
+            // The data layout is NCHW, reshape from [ M, I, H, W ] to [ 1, I * M, H, W ]
+            weightInfo.SetShape({ 1,
+                                  weightShape[0] * weightShape[1],
+                                  weightShape[2],
+                                  weightShape[3] });
+            break;
+    }
+}
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done
+    // starting from the current shape of [ M, I, H, W ]
+    TensorInfo weightPermutedInfo(weightInfo);
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
+        PermutationVector permutationVector{ 3, 2, 0, 1 }; // M -> dim 3, I -> dim 2, H -> dim 0, W -> dim 1
+        weightPermutedInfo = armnnUtils::Permuted(weightInfo, permutationVector); // Info only, no data involved
+    }
+
+    // 2. Reshape the weights
+    ReshapeWeightsForAcl(weightPermutedInfo, dataLayout);
+
+    // 3. Return the permuted and reshaped weight info
+    return weightPermutedInfo;
+}
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer)
+{
+    BOOST_ASSERT_MSG(weightTensor, "Invalid input tensor");
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as a reshape to [ 1, I * M, H, W ] can be better done
+    // starting from the current shape of [ M, I, H, W ]
+    // If no permutation is necessary, leave the permutation vector empty
+    PermutationVector permutationVector{};
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, then permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
+        permutationVector = { 3, 2, 0, 1 }; // M -> dim 3, I -> dim 2, H -> dim 0, W -> dim 1
+    }
+    ConstTensor weightPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer);
+
+    // 2. Reshape the weights
+    ReshapeWeightsForAcl(weightPermuted.GetInfo(), dataLayout); // Changes the shape metadata only
+
+    // 3. Return the tensor; its data was written to the caller-provided permuteBuffer, which the caller still owns
+    return weightPermuted;
+}
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 2b07b2b0d2..a1a8d2a475 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -6,35 +6,42 @@
 #pragma once
 
 #include "ITensorHandle.hpp"
+#include "CpuTensorHandle.hpp"
 
 #include <armnn/Tensor.hpp>
 
+#include <Permute.hpp>
+#include <Profiling.hpp>
+#include <Half.hpp>
+
 #include <boost/cast.hpp>
 
 namespace armnn
 {
 namespace
 {
+
 template<typename ArrayType, typename Arg>
 void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
 {
- if (idx >= num)
- {
-     return;
- }
+    if (idx >= num)
+    {
+        return;
+    }
 
- arg = array[(num - 1) - idx];
- idx++;
-};
+    arg = array[(num - 1) - idx];
+    idx++;
+}
 
 template<typename T, typename ArrayType, typename ...Args>
 void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args)
 {
- AssignValues(num, idx, array, assignee);
+    AssignValues(num, idx, array, assignee);
 
- AssignValues(num, idx, array, args...);
+    AssignValues(num, idx, array, args...);
 }
-} // namespace
+
+} // anonymous namespace
 
 template<typename CopyFunc>
 void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
@@ -142,4 +149,16 @@ void GatherTensorHandlePairs(const DescriptorType& descriptor,
     }
 }
 
-} //namespace armnn
-\ No newline at end of file
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer);
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer);
+
+} //namespace armnn
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index a66b5c4581..4e79bfcd7e 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -14,7 +14,8 @@ COMMON_SOURCES := \
     MemCopyWorkload.cpp \
     OutputHandler.cpp \
     WorkloadData.cpp \
-    WorkloadFactory.cpp
+    WorkloadFactory.cpp \
+    WorkloadUtils.cpp
 
 # COMMON_TEST_SOURCES contains the list of files to be included
 # in the Android unit test build (armnn-tests) and it is picked
diff --git a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
index 37fa0f63d6..2ff66b08d5 100755
--- a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
+++ b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
@@ -327,7 +327,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const boost::multi_array<T, 4>& input,
-    const boost::multi_array<T, 4>& originalKernel,
+    const boost::multi_array<T, 4>& kernel,
     const boost::multi_array<B, 1>& bias,
     const boost::multi_array<T, 4>& outputExpected,
     float qScale,
@@ -344,10 +344,10 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
     unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[1]);
     unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[2]);
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[3]);
-    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(originalKernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(originalKernel.shape()[1]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(originalKernel.shape()[2]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(originalKernel.shape()[3]);
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
     unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
@@ -362,8 +362,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc =
-            armnnUtils::GetTensorInfo<T>(kernelChanMul, kernelChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -423,13 +422,6 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
 
     armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
 
-    // Permute the kernel if necessary
-    boost::multi_array<T, 4> kernel = boost::multi_array<T, 4>(originalKernel);
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernel.data(), kernel.data());
-    }
-
     AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
 
     armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
@@ -484,6 +476,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
     unsigned int kernelHeight = 3;
     unsigned int kernelWidth = 3;
     unsigned int kernelChannels = inputChannels;
+    unsigned int kernelDepthMultiplier = 1;
 
     unsigned int outputHeight = 1;
     unsigned int outputWidth = 1;
@@ -494,7 +487,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(1, outputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({ outputChannels }, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -543,12 +537,6 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
                     0.f, 0.f,  0.f,
                     -1.f, 0.f, -1.f,
             }));
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        std::vector<T> tmp(kernelData.size());
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, kernelData.data(), tmp.data());
-        kernelData = tmp;
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -642,8 +630,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
             inputBatchSize, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo<T>(
             outputBatchSize, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(
-            depthMultiplier, inputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({outputChannels}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -692,7 +680,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
         {0, 2, 1, -1}));
     auto bias = MakeTensor<B, 1>(biasDesc, biasV);
 
-    std::vector<T> originalKernelData = std::vector<T>(
+    std::vector<T> kernelData = std::vector<T>(
             QuantizedVector<T>(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), {
                     1, 1, 1,
                     1, -1, 1,
@@ -717,12 +705,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
                     0, 1, 0,
                     0, 0, 0,
                     0, 0, 0
+
             }));
-    std::vector<T> kernelData = originalKernelData;
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernelData.data(), kernelData.data());
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -840,9 +824,9 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestImpl(
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[2]);
 
     unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
 
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
@@ -853,7 +837,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestImpl(
     armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, armnn::GetDataType<T>());
     armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels},
                                        armnn::GetDataType<T>());
-    armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -1068,10 +1052,10 @@ LayerTestResult<T,4> CompareConvolution2dTestImpl(
     armnn::TensorInfo kernelDesc;
     armnn::TensorInfo biasDesc;
 
-    unsigned int inputShape[]    = {inputNum, inputChannels, inputHeight, inputWidth};
-    unsigned int outputShape[]   = {outputNum, outputChannels, outputHeight, outputWidth};
-    unsigned int kernelShape[]   = {outputChannels, inputChannels, kernelHeight, kernelWidth};
-    unsigned int biasShape[]     = {outputChannels};
+    unsigned int inputShape[]  = {inputNum, inputChannels, inputHeight, inputWidth};
+    unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth};
+    unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth};
+    unsigned int biasShape[]   = {outputChannels};
 
     inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::GetDataType<T>());
     outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::GetDataType<T>());
@@ -1171,19 +1155,17 @@ LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
 
     std::vector<unsigned int> inputShape;
     std::vector<unsigned int> outputShape;
-    std::vector<unsigned int> kernelShape;
-    std::vector<unsigned int> biasShape= { outputChannels };
+    std::vector<unsigned int> kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth };
+    std::vector<unsigned int> biasShape{ outputChannels };
     switch (layout.GetDataLayout())
     {
         case armnn::DataLayout::NCHW:
             inputShape =  { inputNum, inputChannels, inputHeight, inputWidth };
             outputShape = { outputNum, outputChannels, outputHeight, outputWidth };
-            kernelShape = { channelMultiplier, inputChannels, kernelHeight, kernelWidth };
             break;
         case armnn::DataLayout ::NHWC:
             inputShape =  { inputNum, inputHeight, inputWidth, inputChannels };
             outputShape = { outputNum, outputHeight, outputWidth, outputChannels };
-            kernelShape = { channelMultiplier, kernelHeight, kernelWidth, inputChannels };
             break;
         default:
             throw armnn::InvalidArgumentException("unknown data layout ["
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index ddf0d0b587..819b9d6e37 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -661,28 +661,18 @@ LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestCommon(
             24, 49
         })));
 
-    armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType<T>());
     auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
         QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), {
-             32, 16,
-             31, 15,
-             30, 14,
-             29, 13,
-
-             28, 12,
-             27, 11,
-             26, 10,
-             25,  9,
-
-             24,  8,
-             23,  7,
-             22,  6,
-             21,  5,
-
-             20,  4,
-             19,  3,
-             18,  2,
-             17,  1
+             32, 31, 30, 29,
+             28, 27, 26, 25,
+             24, 23, 22, 21,
+             20, 19, 18, 17,
+
+             16, 15, 14, 13,
+             12, 11, 10,  9,
+              8,  7,  6,  5,
+              4,  3,  2,  1
         })));
 
     armnn::TensorInfo outputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType<T>());
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
index 9cadbf09ac..1745b8297a 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
@@ -12,6 +12,7 @@
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <cl/ClTensorHandle.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/WorkloadUtils.hpp>
 
 #include <arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h>
 
@@ -21,14 +22,23 @@ namespace armnn
 using namespace armcomputetensorutils;
 
 arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
-    const TensorInfo& output,
-    const DepthwiseConvolution2dDescriptor& descriptor,
-    const TensorInfo& weights,
-    const Optional<TensorInfo>& biases)
+                                                           const TensorInfo& output,
+                                                           const DepthwiseConvolution2dDescriptor& descriptor,
+                                                           const TensorInfo& weights,
+                                                           const Optional<TensorInfo>& biases)
 {
-    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
     const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+    // ArmNN's weight format is [ M, I, H, W ]
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout);
+
+    // Convert the weights into the compute library format
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
     arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
@@ -42,7 +52,6 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp
     }
 
     const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
-    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
 
     return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -57,10 +66,18 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
     const WorkloadInfo& info)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
-    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
+    // Allocate a buffer for the swizzling of the weight tensor
+    std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]);
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight,
+                                                                   m_Data.m_Parameters.m_DataLayout,
+                                                                   permuteBuffer.get());
 
+    // Convert the weights into the compute library format
     m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
+    BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
@@ -86,13 +103,14 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
+    // ArmNN's weight format is [ M, I, H, W ]
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
-    const unsigned int widthIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 3 : 2;
-    const unsigned int heightIndex = (m_Data.m_Parameters.m_DataLayout == DataLayout::NCHW) ? 2 : 1;
+    // Get the depth multiplier
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
-    //Check for optimisation opportunities.
-    bool use3x3Optimisation = (weightInfo.GetShape()[widthIndex] == 3) && (weightInfo.GetShape()[heightIndex] == 3);
+    // Check for optimisation opportunities.
+    bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3);
     if (use3x3Optimisation)
     {
         m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
@@ -118,7 +136,8 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload(
 
     BOOST_ASSERT(m_DepthwiseConvolutionLayer);
 
-    InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+    ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted);
+    InitializeArmComputeClTensorData(*m_KernelTensor, &weightsPermutedHandle);
 
     if (m_BiasTensor)
     {
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
index 6cad12cba8..be26359662 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp
@@ -8,10 +8,7 @@
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <neon/NeonLayerSupport.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
-
-#include <DataLayoutIndexed.hpp>
-
-using namespace armnnUtils;
+#include <backendsCommon/WorkloadUtils.hpp>
 
 namespace armnn
 {
@@ -19,17 +16,23 @@ namespace armnn
 using namespace armcomputetensorutils;
 
 arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
-    const TensorInfo& output,
-    const DepthwiseConvolution2dDescriptor& descriptor,
-    const TensorInfo& weights,
-    const Optional<TensorInfo>& biases)
+                                                             const TensorInfo& output,
+                                                             const DepthwiseConvolution2dDescriptor& descriptor,
+                                                             const TensorInfo& weights,
+                                                             const Optional<TensorInfo>& biases)
 {
-    const arm_compute::TensorInfo aclInputInfo =
-        BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclOutputInfo =
-        BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
-    const arm_compute::TensorInfo aclWeightsInfo =
-        BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclInputInfo  = BuildArmComputeTensorInfo(input,  descriptor.m_DataLayout);
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+    // ArmNN's weight format is [ M, I, H, W ]
+    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout);
+
+    // Convert the weights into the compute library format
+    const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
 
     arm_compute::TensorInfo aclBiasesInfo;
     arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
@@ -42,9 +45,7 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i
         optionalAclBiasesInfo = &aclBiasesInfo;
     }
 
-    const arm_compute::PadStrideInfo aclPadStrideInfo =
-        BuildArmComputePadStrideInfo(descriptor);
-    const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+    const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
 
     return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo,
                                                               &aclWeightsInfo,
@@ -59,14 +60,21 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     const WorkloadInfo& info)
     : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
-    const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+    // ArmNN's weight format is [ M, I, H, W ]
+    auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
-    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
+    // Allocate a buffer for the swizzling of the weight tensor
+    std::unique_ptr<unsigned char[]> permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]);
 
-    INeonTensorHandle* inputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
-    INeonTensorHandle* outputTensorHandle =  static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
-    DataLayoutIndexed dataLayoutIndex(m_Data.m_Parameters.m_DataLayout);
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
+    ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight,
+                                                                   m_Data.m_Parameters.m_DataLayout,
+                                                                   permuteBuffer.get());
+
+    // Convert the weights into the compute library format
+    m_KernelTensor = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
@@ -84,6 +92,9 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
 
     m_Data.ValidateInputsOutputs("NeonDepthwiseConvolutionWorkload", 1, 1);
 
+    INeonTensorHandle* inputTensorHandle  = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0]);
+    INeonTensorHandle* outputTensorHandle = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0]);
+
     arm_compute::ITensor& input  = inputTensorHandle->GetTensor();
     arm_compute::ITensor& output = outputTensorHandle->GetTensor();
 
@@ -91,9 +102,11 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
 
-    bool use3x3Optimisation = weightInfo.GetShape()[dataLayoutIndex.GetWidthIndex()] == 3 &&
-                              weightInfo.GetShape()[dataLayoutIndex.GetHeightIndex()] == 3;
+    // Get the depth multiplier
+    const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
+    // Check for optimisation opportunities.
+    bool use3x3Optimisation = (weightInfo.GetShape()[2] == 3) && (weightInfo.GetShape()[3] == 3);
     if (use3x3Optimisation)
     {
         m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
@@ -102,7 +115,8 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
     else
     {
@@ -112,12 +126,14 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload(
                                                            m_KernelTensor.get(),
                                                            m_BiasTensor.get(),
                                                            &output,
-                                                           padStrideInfo);
+                                                           padStrideInfo,
+                                                           depthMultiplier);
     }
 
     BOOST_ASSERT(m_pDepthwiseConvolutionLayer);
 
-    InitializeArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight);
+    ScopedCpuTensorHandle weightsPermutedHandle(weightPermuted);
+    InitializeArmComputeTensorData(*m_KernelTensor, &weightsPermutedHandle);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index 704bc368d2..5c07f57ec0 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -57,7 +57,6 @@ static void ConvImpl(ConvData data,
                      float filterScale,
                      int32_t filterOffset,
                      const BiasType* biasData,
-                     InputType* outputData,
                      float outputScale,
                      int32_t outputOffset,
                      const TensorInfo& filterInfo,
@@ -68,10 +67,10 @@ static void ConvImpl(ConvData data,
         throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
     }
 
-    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
-    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
+    const TensorInfo& inputInfo  = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
 
-    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
+    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                             GetOutputTensorData<InputType>(0, data),
                                             data.m_Parameters.m_DataLayout);
 
@@ -81,18 +80,18 @@ static void ConvImpl(ConvData data,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
-    unsigned int channelsInput  = filterInfo.GetShape()[channelsIndex];
-    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
+    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
+    unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
+    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];
 
-    unsigned int batchSize    = outputInfo0.GetShape()[0];
-    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
-    unsigned int widthOutput  = outputInfo0.GetShape()[widthIndex];
-    unsigned int heightInput  = inputInfo0.GetShape()[heightIndex];
-    unsigned int widthInput   = inputInfo0.GetShape()[widthIndex];
+    unsigned int batchSize    = outputInfo.GetShape()[0];
+    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
+    unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
+    unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
+    unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];
 
-    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
-    unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];
+    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
+    unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];
 
     unsigned int paddingTop  = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
@@ -102,68 +101,56 @@ static void ConvImpl(ConvData data,
     // The world's least efficient convolution.
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
-        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
+        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
         {
-            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
+            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
             {
-                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
+                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
                     AccumulatorType sum = AccumulatorType();
 
                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
-                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
+                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
                         unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
-                            cInput = cOutput / depthMult;
-                            depthwiseMultiplierIdx = cOutput % depthMult;
+                            cInput = cOutput / depthMultiplier;
+                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }
 
-                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
+                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                         {
-                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
+                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                             {
                                 // This loop goes over each input element for each output element.
 
-                                unsigned int filterIndex;
+                                unsigned int filterIndex = 0;
 
                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
-                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
-                                                        * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
-                                                      cInput;
-                                    }
-                                    else
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
-                                                        * channelsInput +
-                                                      cInput * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
-                                                      xFilter;
-                                    }
+                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
+                                                  cInput * filterWidth * filterHeight +
+                                                  yFilter * filterWidth +
+                                                  xFilter;
                                 }
                                 else
                                 {
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
+                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
+                                                      yFilter * filterWidth * inputChannels +
+                                                      xFilter * inputChannels +
                                                       cInput;
                                     }
                                     else
                                     {
-                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
-                                                      cInput  * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
+                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
+                                                      cInput  * filterWidth * filterHeight +
+                                                      yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }
@@ -177,8 +164,8 @@ static void ConvImpl(ConvData data,
                                 AccumulatorType inputValue;
 
                                 // Check if we're in the padding.
-                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
-                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft )
+                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
+                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
                                 {
                                     inputValue = AccumulatorType();
                                 }
@@ -188,17 +175,17 @@ static void ConvImpl(ConvData data,
 
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        inputIndex = batchIdx * heightInput * widthInput  * channelsInput +
-                                                     (yInput - paddingTop) * widthInput * channelsInput +
-                                                     (xInput - paddingLeft) * channelsInput +
+                                        inputIndex = batchIdx * inputHeight * inputWidth  * inputChannels +
+                                                     (yInput - paddingTop) * inputWidth * inputChannels +
+                                                     (xInput - paddingLeft) * inputChannels +
                                                      cInput;
 
                                     }
                                     else
                                     {
-                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
-                                                     widthInput * heightInput * cInput +
-                                                     widthInput * (yInput - paddingTop) +
+                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
+                                                     inputWidth * inputHeight * cInput +
+                                                     inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }
 
diff --git a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
index 20905646d7..7b298df967 100644
--- a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvolution2dFloat32Workload_Execute");
 
-    float*       outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData  = GetInputTensorDataFloat(0, m_Data);
-    const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* filterData = m_Weight->template GetConstTensor<float>();
+    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<armnn::Convolution2dQueueDescriptor, float, float, float>(
-        m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo);
+        m_Data, inputData, 0.0f, 0, filterData, 0.0f, 0, biasData, 0.0f, 0, filterInfo);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
index 881e9bf6b0..af2c7ad0d6 100644
--- a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
@@ -27,10 +27,7 @@ void RefConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -39,7 +36,7 @@ void RefConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
index e89013b9bd..756e958753 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dFloat32Workload_Execute");
 
-    float*       outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData  = GetInputTensorDataFloat(0, m_Data);
     const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* biasData   = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, float, float, float>
-        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true);
+        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, 0.0f, 0, filterInfo, true);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
index e8e501d6ae..629b729ea6 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
@@ -28,10 +28,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -40,7 +37,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
 }
 
 } //namespace armnn
author	Matteo Martincigh <matteo.martincigh@arm.com>	2018-12-18 09:26:39 +0000
committer	Matteo Martincigh <matteo.martincigh@arm.com>	2019-01-04 17:28:07 +0000
commit	747ef82c88f9afe14a8b80b6b3b34118353e97f2 (patch)
tree	a29ac33b84fb96a41103a0a97327189495374cc9 /src/backends
parent	760892724d131c7da4b9baad05cddd49276ad6bb (diff)
download	armnn-747ef82c88f9afe14a8b80b6b3b34118353e97f2.tar.gz