From d8df0260ced49a2796ff70e96284cf00eb316bcc Mon Sep 17 00:00:00 2001
From: Teresa Charlin
Date: Mon, 11 Nov 2019 12:28:15 +0000
Subject: IVGCVSW-4079 Add support of per-axis quantization to
 DepthwiseConvolution2d

!android-nn-driver:2260

Signed-off-by: Teresa Charlin
Change-Id: Iad93c1940568ffa65ed314c8871ea66caf4f9e4a
---
 src/backends/backendsCommon/WorkloadData.cpp  |  12 +-
 .../test/layerTests/Conv2dTestImpl.cpp        | 108 +++++++++++++
 .../test/layerTests/Conv2dTestImpl.hpp        |   5 +
 src/backends/reference/RefLayerSupport.cpp    |  25 ++-
 src/backends/reference/test/RefLayerTests.cpp |   6 +-
 src/backends/reference/workloads/ConvImpl.hpp | 177 ---------------------
 6 files changed, 147 insertions(+), 186 deletions(-)

diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 6222ba4800..0a2b27afbf 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1210,18 +1210,24 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
                                         numWeightInputChannels % numWeightChannelMultiplier));
     }
 
-    ValidateTensorDataTypesMatch(inputTensorInfo, weightTensorInfo, descriptorName, "input", "weight");
+    ValidateWeightDataType(inputTensorInfo, weightTensorInfo, descriptorName);
+
+    Optional<TensorInfo> optionalBiasTensorInfo;
     if (m_Parameters.m_BiasEnabled)
     {
         ValidatePointer(m_Bias, descriptorName, "bias");
 
-        const TensorInfo& biasTensorInfo = m_Bias->GetTensorInfo();
-        ValidateTensorNumDimensions(biasTensorInfo, descriptorName, 1, "bias");
+        optionalBiasTensorInfo = MakeOptional<TensorInfo>(m_Bias->GetTensorInfo());
+        const TensorInfo& biasTensorInfo = optionalBiasTensorInfo.value();
 
         ValidateBiasTensorQuantization(biasTensorInfo, inputTensorInfo, weightTensorInfo, descriptorName);
         ValidateTensorDataType(biasTensorInfo, GetBiasDataType(inputTensorInfo.GetDataType()), descriptorName, "bias");
     }
 
+    ValidatePerAxisQuantization(inputTensorInfo,
+                                outputTensorInfo,
+                                weightTensorInfo,
+                                optionalBiasTensorInfo,
+                                descriptorName);
+
     std::vector<DataType> supportedTypes =
     {
diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
index 5fac09f5b3..22e7e29db7 100644
--- a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
@@ -3260,6 +3260,114 @@ LayerTestResult<int16_t, 4> DepthwiseConvolution2dDepthMul1Int16Test(
         workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
 }
 
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dPerAxisQuantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    using namespace armnn;
+
+    const DataType inputType  = DataType::QuantisedAsymm8;
+    const DataType kernelType = DataType::QuantizedSymm8PerAxis;
+    const DataType biasType   = DataType::Signed32;
+
+    TensorInfo inputInfo ({ 1, 3, 3, 2 }, inputType, 0.5f, 128); // N H W C
+    TensorInfo outputInfo({ 1, 2, 2, 4 }, inputType, 1.0f, 128); // N H W C
+
+    const std::vector<float> quantScales{ 1.0f, 0.5f, 1.0f, 0.5f };
+    const unsigned int quantDimension = 0;
+    TensorInfo kernelInfo({ 2, 2, 2, 2 }, kernelType, quantScales, quantDimension); // M I H W
+
+    const std::vector<float> biasQuantScales{ 0.5f, 0.25f, 0.5f, 0.25f };
+    constexpr unsigned int biasQuantDimension = 0;
+    TensorInfo biasInfo({ 4 }, biasType, biasQuantScales, biasQuantDimension);
+
+    std::vector<uint8_t> inputData =
+    {
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130,
+        129, 130
+    };
+
+    std::vector<int8_t> kernelData =
+    {
+        1, 1, 1, 1,
+        1, 1, 1, 1,
+        1, 1, 1, 1,
+        1, 1, 1, 1
+    };
+
+    std::vector<int32_t> biasData =
+    {
+        4, 4, 4, 4
+    };
+
+    std::vector<uint8_t> expectedOutputData =
+    {
+        132, 130, 134, 131,
+        132, 130, 134, 131,
+        132, 130, 134, 131,
+        132, 130, 134, 131
+    };
+
+    if (layout == DataLayout::NCHW)
+    {
+        PermuteTensorNhwcToNchw(inputInfo, inputData);
+        PermuteTensorNhwcToNchw(outputInfo, expectedOutputData);
+    }
+
+    DepthwiseConvolution2dDescriptor descriptor;
+    descriptor.m_StrideX     = 1;
+    descriptor.m_StrideY     = 1;
+    descriptor.m_PadLeft     = 0;
+    descriptor.m_PadRight    = 0;
+    descriptor.m_PadTop      = 0;
+    descriptor.m_PadBottom   = 0;
+    descriptor.m_DilationX   = 1;
+    descriptor.m_DilationY   = 1;
+    descriptor.m_BiasEnabled = true;
+    descriptor.m_DataLayout  = layout;
+
+    std::unique_ptr<ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputInfo);
+    std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputInfo);
+
+    WorkloadInfo workloadInfo;
+    ScopedCpuTensorHandle weightTensor(kernelInfo);
+    ScopedCpuTensorHandle biasTensor(biasInfo);
+
+    AllocateAndCopyDataToITensorHandle(&weightTensor, kernelData.data());
+    AllocateAndCopyDataToITensorHandle(&biasTensor, biasData.data());
+
+    DepthwiseConvolution2dQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = descriptor;
+    queueDescriptor.m_Weight     = &weightTensor;
+    queueDescriptor.m_Bias       = &biasTensor;
+
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputInfo, outputHandle.get());
+
+    std::unique_ptr<IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(queueDescriptor, workloadInfo);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), inputData.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    LayerTestResult<uint8_t, 4> ret(outputInfo);
+
+    CopyDataFromITensorHandle(ret.output.origin(), outputHandle.get());
+    ret.outputExpected = MakeTensor<uint8_t, 4>(outputInfo, expectedOutputData);
+
+    return ret;
+}
+
 LayerTestResult<float, 4> CompareDepthwiseConvolution2dFloatTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp
index 3aac975b3b..69bfa97281 100644
--- a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp
+++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp
@@ -210,6 +210,11 @@ LayerTestResult<int16_t, 4> DepthwiseConvolution2dDepthMul1Int16Test(
     bool biasEnabled,
     const armnn::DataLayout layout);
 
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dPerAxisQuantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
 LayerTestResult<float, 4> CompareDepthwiseConvolution2dFloatTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index e98af7097b..3507162de8 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -565,14 +565,29 @@ bool RefLayerSupport::IsDepthwiseConvolutionSupported(const TensorInfo& input,
     supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
                                   "Reference DepthwiseConvolution2d: output is not a supported type.");
 
-    supported &= CheckSupportRule(TypeAnyOf(weights, supportedTypes), reasonIfUnsupported,
-                                  "Reference DepthwiseConvolution2d: weights is not a supported type.");
-
     supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
                                   "Reference DepthwiseConvolution2d: input and output types mismatched.");
 
-    supported &= CheckSupportRule(TypesAreEqual(input, weights), reasonIfUnsupported,
-                                  "Reference DepthwiseConvolution2d: input and weights types mismatched.");
+    const DataType inputType = input.GetDataType();
+    if (inputType == DataType::QuantisedAsymm8)
+    {
+        std::array<DataType, 2> supportedWeightTypes =
+        {
+            DataType::QuantisedAsymm8,
+            DataType::QuantizedSymm8PerAxis
+        };
+
+        supported &= CheckSupportRule(TypeAnyOf(weights, supportedWeightTypes), reasonIfUnsupported,
+                                      "Reference DepthwiseConvolution2d: weights type not supported for quantized input.");
+    }
+    else
+    {
+        supported &= CheckSupportRule(TypeAnyOf(weights, supportedTypes), reasonIfUnsupported,
+                                      "Reference DepthwiseConvolution2d: weights is not a supported type.");
+
+        supported &= CheckSupportRule(TypesAreEqual(input, weights), reasonIfUnsupported,
+                                      "Reference DepthwiseConvolution2d: input and weights types mismatched.");
+    }
 
     if (biases.has_value())
     {
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index cd5c9273f7..c5986e0b12 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -159,7 +159,6 @@ ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dUint8,
                      DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dQSymm16, DepthwiseConvolution2dInt16Test, true, DataLayout::NCHW)
 
-// NHWC Depthwise Convolution
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dNhwc, DepthwiseConvolution2dTest, true, DataLayout::NHWC)
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dUint8Nhwc, DepthwiseConvolution2dUint8Test, true, DataLayout::NHWC)
@@ -263,6 +262,11 @@ ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dAsymmetricNhwc,
 
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthMul64, DepthwiseConvolution2dDepthMul64Test);
 
+ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dPerAxisQuantTestNchw, DepthwiseConvolution2dPerAxisQuantTest,
+                     DataLayout::NCHW);
+ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dPerAxisQuantTestNhwc, DepthwiseConvolution2dPerAxisQuantTest,
+                     DataLayout::NHWC);
+
 // Pooling
 //MaxPooling
 ARMNN_AUTO_TEST_CASE(SimpleMaxPooling2dSize2x2Stride2x2, SimpleMaxPooling2dSize2x2Stride2x2Test, false)
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index 3551ba8f90..7dba760d87 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -50,183 +50,6 @@ private:
     int32_t m_RightShift;
 };
 
-/// An implementation shared by normal and depthwise convolution.
-template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
-static void ConvImpl(ConvData data,
-                     const InputType* inputData,
-                     float inputScale,
-                     int32_t inputOffset,
-                     const InputType* filterData,
-                     float filterScale,
-                     int32_t filterOffset,
-                     const BiasType* biasData,
-                     float outputScale,
-                     int32_t outputOffset,
-                     const TensorInfo& filterInfo,
-                     bool depthwise = false)
-{
-    if (data.m_Parameters.m_BiasEnabled && !biasData)
-    {
-        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
-    }
-
-    const TensorInfo& inputInfo  = GetTensorInfo(data.m_Inputs[0]);
-    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
-
-    TensorBufferArrayView<InputType> output(outputInfo.GetShape(),
-                                            GetOutputTensorData<InputType>(0, data),
-                                            data.m_Parameters.m_DataLayout);
-
-    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
-
-    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
-    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
-    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
-
-    unsigned int depthMultiplier = depthwise ? filterInfo.GetShape()[0] : 1;
-    unsigned int inputChannels   = depthwise ? filterInfo.GetShape()[1] : filterInfo.GetShape()[channelsIndex];
-    unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : filterInfo.GetShape()[0];
-
-    unsigned int batchSize    = outputInfo.GetShape()[0];
-    unsigned int outputHeight = outputInfo.GetShape()[heightIndex];
-    unsigned int outputWidth  = outputInfo.GetShape()[widthIndex];
-    unsigned int inputHeight  = inputInfo.GetShape()[heightIndex];
-    unsigned int inputWidth   = inputInfo.GetShape()[widthIndex];
-
-    unsigned int filterHeight = depthwise ? filterInfo.GetShape()[2] : filterInfo.GetShape()[heightIndex];
-    unsigned int filterWidth  = depthwise ? filterInfo.GetShape()[3] : filterInfo.GetShape()[widthIndex];
-
-    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
-    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
-    unsigned int xStride     = data.m_Parameters.m_StrideX;
-    unsigned int yStride     = data.m_Parameters.m_StrideY;
-    unsigned int xDilation   = data.m_Parameters.m_DilationX;
-    unsigned int yDilation   = data.m_Parameters.m_DilationY;
-
-    // The world's least efficient convolution.
-    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
-    {
-        for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
-        {
-            for (unsigned int yOutput = 0; yOutput < outputHeight; yOutput++)
-            {
-                for (unsigned int xOutput = 0; xOutput < outputWidth; xOutput++)
-                {
-                    // This loop goes over each output element.
-                    AccumulatorType sum = AccumulatorType();
-
-                    // For depthwise, each output channel corresponds to exactly one input channel.
-                    // For normal, must loop over each input channel.
-                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
-                    {
-                        unsigned int depthwiseMultiplierIdx = 0;
-                        if (depthwise)
-                        {
-                            cInput                 = cOutput / depthMultiplier;
-                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
-                        }
-
-                        for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
-                        {
-                            for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
-                            {
-                                // This loop goes over each input element for each output element.
-
-                                unsigned int filterIndex = 0;
-
-                                // Since dimensionality of kernel depends on depthwiseness, so does index.
-                                if (depthwise)
-                                {
-                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
-                                                  cInput * filterWidth * filterHeight +
-                                                  yFilter * filterWidth +
-                                                  xFilter;
-                                }
-                                else
-                                {
-                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
-                                    {
-                                        filterIndex = cOutput * filterHeight * filterWidth * inputChannels +
-                                                      yFilter * filterWidth * inputChannels +
-                                                      xFilter * inputChannels +
-                                                      cInput;
-                                    }
-                                    else
-                                    {
-                                        filterIndex = cOutput * filterWidth * filterHeight * inputChannels +
-                                                      cInput * filterWidth * filterHeight +
-                                                      yFilter * filterWidth +
-                                                      xFilter;
-                                    }
-                                }
-
-                                AccumulatorType filterValue = filterData[filterIndex] -
-                                    boost::numeric_cast<AccumulatorType>(filterOffset);
-
-                                unsigned int yInput = yOutput * yStride + yFilter * yDilation;
-                                unsigned int xInput = xOutput * xStride + xFilter * xDilation;
-
-                                AccumulatorType inputValue;
-
-                                // Check if we're in the padding.
-                                if (yInput < paddingTop || yInput >= inputHeight + paddingTop ||
-                                    xInput < paddingLeft || xInput >= inputWidth + paddingLeft)
-                                {
-                                    inputValue = AccumulatorType();
-                                }
-                                else
-                                {
-                                    unsigned int inputIndex;
-
-                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
-                                    {
-                                        inputIndex = batchIdx * inputHeight * inputWidth * inputChannels +
-                                                     (yInput - paddingTop) * inputWidth * inputChannels +
-                                                     (xInput - paddingLeft) * inputChannels +
-                                                     cInput;
-                                    }
-                                    else
-                                    {
-                                        inputIndex = batchIdx * inputWidth * inputHeight * inputChannels +
-                                                     inputWidth * inputHeight * cInput +
-                                                     inputWidth * (yInput - paddingTop) +
-                                                     xInput - paddingLeft;
-                                    }
-
-                                    inputValue = inputData[inputIndex] -
-                                        boost::numeric_cast<AccumulatorType>(inputOffset);
-                                }
-
-                                sum += filterValue * inputValue;
-                            }
-                        }
-                    }
-
-                    if (data.m_Parameters.m_BiasEnabled)
-                    {
-                        sum += biasData[cOutput];
-                    }
-
-                    if (outputScale != 0.0f)
-                    {
-                        float multiplier = (inputScale * filterScale) / outputScale;
-                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
-                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
-                        //  sum = std::round(multiplier * sum + outputOffset);
-                        sum = boost::numeric_cast<AccumulatorType>(
-                            QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
-                            + boost::numeric_cast<AccumulatorType>(outputOffset);
-                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
-                    }
-
-                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
-                }
-            }
-        }
-    }
-}
-
 void Convolve(const TensorShape& rInputShape,
               Decoder<float>& rInputDecoder,
               const TensorShape& rOutputShape,
-- 
cgit v1.2.1
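
The expected values in DepthwiseConvolution2dPerAxisQuantTest can be reproduced by hand from the quantization parameters in the patch. The sketch below is illustrative only and deliberately avoids the ArmNN API; the channel mapping cInput = cOutput / depthMultiplier is taken from the ConvImpl code removed above, and everything else (scales, offsets, raw weight/bias values) comes from the test itself. It dequantizes per output channel, sums the four 2x2 kernel taps, adds the bias, and requantizes:

// Standalone sanity check (not ArmNN code) for the per-axis test's expected outputs.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    const float inputScale   = 0.5f; // input:  QAsymm8, scale 0.5, offset 128
    const float outputScale  = 1.0f; // output: QAsymm8, scale 1.0, offset 128
    const int   outputOffset = 128;

    // Per-output-channel weight scales (QSymm8PerAxis). The test's bias scales are
    // inputScale * weightScales[c] == { 0.5, 0.25, 0.5, 0.25 }, as Signed32 bias requires.
    const std::vector<float> weightScales{ 1.0f, 0.5f, 1.0f, 0.5f };

    // Input channel 0 dequantizes to (129 - 128) * 0.5 = 0.5, channel 1 to (130 - 128) * 0.5 = 1.0.
    const float dequantInput[2] = { 0.5f, 1.0f };

    // 2x2 kernel, stride 1, no padding: each output element sums 4 taps of one input channel.
    for (int c = 0; c < 4; ++c)
    {
        const float weight = 1.0f * weightScales[c];              // quantized weight value is 1
        const float bias   = 4.0f * inputScale * weightScales[c]; // quantized bias value is 4
        const float acc    = 4.0f * dequantInput[c / 2] * weight + bias;

        const int quantized = std::min(255, std::max(0,
            static_cast<int>(std::round(acc / outputScale)) + outputOffset));
        std::cout << "channel " << c << ": " << quantized << "\n"; // prints 132, 130, 134, 131
    }
    return 0;
}

The four printed values match one output pixel of expectedOutputData ({ 132, 130, 134, 131 }), confirming that each output channel is scaled by its own weight scale rather than a single per-tensor scale.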