From 747ef82c88f9afe14a8b80b6b3b34118353e97f2 Mon Sep 17 00:00:00 2001
From: Matteo Martincigh
Date: Tue, 18 Dec 2018 09:26:39 +0000
Subject: MLCE-77 Depthwise Convolution with depth multiplier > 1 doesn't work

 * Unified ArmNN's weight format to [ M, I, H, W ] for the depthwise
   convolution
 * Added conversion utilities to permute/reshape the weights as
   appropriate when using CL and Neon backends
 * Updated the reference implementation of the convolution
 * Updated the relevant unit tests accordingly

!android-nn-driver:459

Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18
---
 src/backends/reference/workloads/ConvImpl.hpp      | 93 ++++++++++------------
 .../workloads/RefConvolution2dFloat32Workload.cpp  |  8 +-
 .../workloads/RefConvolution2dUint8Workload.cpp    |  7 +-
 .../RefDepthwiseConvolution2dFloat32Workload.cpp   |  6 +-
 .../RefDepthwiseConvolution2dUint8Workload.cpp     |  7 +-
 5 files changed, 49 insertions(+), 72 deletions(-)

(limited to 'src/backends/reference/workloads')
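Note on the new weight layout before the file-by-file changes: ArmNN now
stores depthwise weights as [ M, I, H, W ] (depth multiplier, input
channels, kernel height, kernel width). As a rough illustration of the
kind of permutation the new conversion utilities perform, the sketch
below rearranges a depthwise weight tensor from the Android NN / TFLite
layout [ 1, H, W, I*M ] (packed channel index = i*M + m) into
[ M, I, H, W ]. The source layout and the function name are assumptions
made for this sketch, not the actual utilities added by this patch.

#include <cstddef>
#include <vector>

// Hypothetical helper: permute [ 1, H, W, I*M ] -> [ M, I, H, W ].
std::vector<float> PermuteDepthwiseWeightsToMIHW(const std::vector<float>& src,
                                                 std::size_t H, std::size_t W,
                                                 std::size_t I, std::size_t M)
{
    std::vector<float> dst(M * I * H * W);
    for (std::size_t m = 0; m < M; ++m)
    {
        for (std::size_t i = 0; i < I; ++i)
        {
            for (std::size_t h = 0; h < H; ++h)
            {
                for (std::size_t w = 0; w < W; ++w)
                {
                    // Source is channel-minor: channel = i*M + m (assumed convention).
                    const std::size_t srcIdx = (h * W + w) * (I * M) + (i * M + m);
                    // Destination is plain row-major [ M, I, H, W ].
                    const std::size_t dstIdx = ((m * I + i) * H + h) * W + w;
                    dst[dstIdx] = src[srcIdx];
                }
            }
        }
    }
    return dst;
}

Making the depth multiplier the outermost dimension is what lets the
reference implementation below index depthwise weights without branching
on NCHW/NHWC.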
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index 704bc368d2..5c07f57ec0 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -57,7 +57,6 @@ static void ConvImpl(ConvData data,
                      float filterScale,
                      int32_t filterOffset,
                      const BiasType* biasData,
-                     InputType* outputData,
                      float outputScale,
                      int32_t outputOffset,
                      const TensorInfo& filterInfo,
@@ -68,10 +67,10 @@ static void ConvImpl(ConvData data,
         throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
     }
 
-    const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]);
-    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
 
-    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
+    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
                                             GetOutputTensorData<InputType>(0, data),
                                             data.m_Parameters.m_DataLayout);
 
@@ -81,18 +80,18 @@ static void ConvImpl(ConvData data,
     const unsigned int heightIndex = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex = dataLayoutIndexed.GetWidthIndex();
 
-    unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1;
-    unsigned int channelsInput = filterInfo.GetShape()[channelsIndex];
-    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
+    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
+    unsigned int inputChannels = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
+    unsigned int outputChannels = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];
 
-    unsigned int batchSize = outputInfo0.GetShape()[0];
-    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
-    unsigned int widthOutput = outputInfo0.GetShape()[widthIndex];
-    unsigned int heightInput = inputInfo0.GetShape()[heightIndex];
-    unsigned int widthInput = inputInfo0.GetShape()[widthIndex];
+    unsigned int batchSize = outputInfo.GetShape()[0];
+    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
+    unsigned int outputWidth = outputInfo.GetShape()[widthIndex];
+    unsigned int inputHeight = inputInfo.GetShape()[heightIndex];
+    unsigned int inputWidth = inputInfo.GetShape()[widthIndex];
 
-    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
-    unsigned int widthFilter = filterInfo.GetShape()[widthIndex];
+    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
+    unsigned int filterWidth = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];
 
     unsigned int paddingTop = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
@@ -102,68 +101,56 @@ static void ConvImpl(ConvData data,
     // The world's least efficient convolution.
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
-        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
+        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
         {
-            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
+            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
             {
-                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
+                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
                 {
                     // This loop goes over each output element.
                     AccumulatorType sum = AccumulatorType();
 
                     // For depthwise, each output channel corresponds to exactly one input channel.
                     // For normal, must loop over each input channel.
-                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
+                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
                         unsigned int depthwiseMultiplierIdx = 0;
                         if (depthwise)
                         {
-                            cInput = cOutput / depthMult;
-                            depthwiseMultiplierIdx = cOutput % depthMult;
+                            cInput = cOutput / depthMultiplier;
+                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
                         }
 
-                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
+                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                         {
-                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
+                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
                             {
                                 // This loop goes over each input element for each output element.
 
-                                unsigned int filterIndex;
+                                unsigned int filterIndex = 0;
 
                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
-                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
-                                                      * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
-                                                      cInput;
-                                    }
-                                    else
-                                    {
-                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
-                                                      * channelsInput +
-                                                      cInput * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
-                                                      xFilter;
-                                    }
+                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
+                                                  cInput * filterWidth * filterHeight +
+                                                  yFilter * filterWidth +
+                                                  xFilter;
                                 }
                                 else
                                 {
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
-                                                      yFilter * widthFilter * channelsInput +
-                                                      xFilter * channelsInput +
+                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
+                                                      yFilter * filterWidth * inputChannels +
+                                                      xFilter * inputChannels +
                                                       cInput;
                                     }
                                     else
                                     {
-                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
-                                                      cInput * widthFilter * heightFilter +
-                                                      yFilter * widthFilter +
+                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
+                                                      cInput * filterWidth * filterHeight +
+                                                      yFilter * filterWidth +
                                                       xFilter;
                                     }
                                 }
@@ -177,8 +164,8 @@ static void ConvImpl(ConvData data,
                                 AccumulatorType inputValue;
 
                                 // Check if we're in the padding.
-                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
-                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft )
+                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
+                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft )
                                 {
                                     inputValue = AccumulatorType();
                                 }
@@ -188,17 +175,17 @@ static void ConvImpl(ConvData data,
 
                                     if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                     {
-                                        inputIndex = batchIdx * heightInput * widthInput * channelsInput +
-                                                     (yInput - paddingTop) * widthInput * channelsInput +
-                                                     (xInput - paddingLeft) * channelsInput +
+                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
+                                                     (yInput - paddingTop) * inputWidth * inputChannels +
+                                                     (xInput - paddingLeft) * inputChannels +
                                                      cInput;
                                     }
                                     else
                                     {
-                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
-                                                     widthInput * heightInput * cInput +
-                                                     widthInput * (yInput - paddingTop) +
+                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
+                                                     inputWidth * inputHeight * cInput +
+                                                     inputWidth * (yInput - paddingTop) +
                                                      xInput - paddingLeft;
                                     }
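The key simplification above is that depthwise weights now have a single,
layout-independent index. Extracted as a standalone sketch for clarity
(the function name is ours; the arithmetic is exactly what the updated
ConvImpl computes):

#include <cassert>

// Index into a depthwise weight tensor stored as [ M, I, H, W ].
unsigned int DepthwiseFilterIndex(unsigned int cOutput,         // output channel in [0, I*M)
                                  unsigned int depthMultiplier, // M
                                  unsigned int inputChannels,   // I
                                  unsigned int filterHeight,    // H
                                  unsigned int filterWidth,     // W
                                  unsigned int yFilter,
                                  unsigned int xFilter)
{
    assert(cOutput < inputChannels * depthMultiplier);

    // Each output channel corresponds to exactly one (input channel, multiplier) pair.
    const unsigned int cInput        = cOutput / depthMultiplier;
    const unsigned int multiplierIdx = cOutput % depthMultiplier;

    // Plain row-major index into [ M, I, H, W ]; no NHWC/NCHW branch is needed.
    return multiplierIdx * filterWidth * filterHeight * inputChannels +
           cInput * filterWidth * filterHeight +
           yFilter * filterWidth +
           xFilter;
}

For example, with inputChannels = 2 and depthMultiplier = 2 there are four
output channels: channels 0 and 1 both read input channel 0 (multiplier
indices 0 and 1), and channels 2 and 3 both read input channel 1. This
decomposition is what makes depth multiplier > 1 work in the reference
backend.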
diff --git a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
index 20905646d7..7b298df967 100644
--- a/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvolution2dFloat32Workload_Execute");
 
-    float* outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData = GetInputTensorDataFloat(0, m_Data);
-    const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* filterData = m_Weight->template GetConstTensor<float>();
+    const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<Convolution2dQueueDescriptor, float, float, float>(
-        m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo);
+        m_Data, inputData, 0.0f, 0, filterData, 0.0f, 0, biasData, 0.0f, 0, filterInfo);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
index 881e9bf6b0..af2c7ad0d6 100644
--- a/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dUint8Workload.cpp
@@ -27,10 +27,7 @@ void RefConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -39,7 +36,7 @@ void RefConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo);
 }
 
 } //namespace armnn
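In the uint8 workload above (and the two depthwise workloads below) the
output pointer has likewise been dropped: ConvImpl now writes results
itself through TensorBufferArrayView and only needs the output's
quantization parameters. For readers less familiar with the scale/offset
arguments being threaded through, the sketch below shows the standard
affine requantization such a uint8 path performs. It is a generic
formulation for illustration, with a hypothetical function name, not a
copy of ConvImpl's internals.

#include <algorithm>
#include <cmath>
#include <cstdint>

// accumulator = sum of (input - inputOffset) * (weight - weightOffset), plus bias.
uint8_t RequantizeAccumulator(int32_t accumulator,
                              float inputScale, float weightScale,
                              float outputScale, int32_t outputOffset)
{
    // The accumulator represents a real value of accumulator * inputScale * weightScale;
    // re-express it on the output's quantization grid and clamp to the uint8 range.
    const float rescaled = static_cast<float>(accumulator) * (inputScale * weightScale / outputScale);
    const int32_t quantized = static_cast<int32_t>(std::round(rescaled)) + outputOffset;
    return static_cast<uint8_t>(std::clamp<int32_t>(quantized, 0, 255));
}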
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
index e89013b9bd..756e958753 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dFloat32Workload.cpp
@@ -23,15 +23,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dFloat32Workload_Execute");
 
-    float* outputData = GetOutputTensorDataFloat(0, m_Data);
     const float* inputData = GetInputTensorDataFloat(0, m_Data);
     const float* weightData = m_Weight->template GetConstTensor<float>();
-    const float* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<float>() : nullptr;
+    const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<float>() : nullptr;
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
     ConvImpl<DepthwiseConvolution2dQueueDescriptor, float, float, float>
-        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true);
+        (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, 0.0f, 0, filterInfo, true);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
index e8e501d6ae..629b729ea6 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dUint8Workload.cpp
@@ -28,10 +28,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
     const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>();
     const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get());
-    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ?
-        m_Bias->template GetConstTensor<int32_t>() :
-        nullptr;
-    uint8_t* outputData = GetOutputTensorDataU8(0, m_Data);
+    const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->template GetConstTensor<int32_t>() : nullptr;
     const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& filterInfo = m_Weight->GetTensorInfo();
 
@@ -40,7 +37,7 @@ void RefDepthwiseConvolution2dUint8Workload::Execute() const
         inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(),
         weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(),
         biasData,
-        outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
+        outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true);
 }
 
 } //namespace armnn
-- 
cgit v1.2.1