From 53ef79504b4c881c572735393c2eede5fa556c46 Mon Sep 17 00:00:00 2001
From: Jan Eilers
Date: Wed, 2 Jun 2021 12:01:25 +0100
Subject: IVGCVSW-5826 Change weights layout for depthwise to [1,H,W,I*M]

* This change is necessary because tflite uses a [1,H,W,I*M] layout and uses the
  I*M dimension for per-axis quantization. Our previous layout [M,I,H,W] can't
  handle the corresponding quantization scales.
* Updates the OnnxParser, TfLiteParser and TfliteDelegate
* Updates the CpuRef, CpuAcc and GpuAcc backends
* Adjusts unit tests
* Adds a test to ensure models with the old layout can still be read and executed
* Adds a conversion function from the new layout [1,H,W,I*M] back to the previous
  layout [M,I,H,W], which can be used by backend developers

!android-nn-driver:5553

Signed-off-by: Jan Eilers
Change-Id: Ifef23368b8c3702cf315a5838d214f7dc13c0152
---
 src/backends/backendsCommon/WorkloadData.cpp        |  38 ++--
 src/backends/backendsCommon/WorkloadData.hpp        |  14 +-
 src/backends/backendsCommon/WorkloadUtils.cpp       |  94 ++++++++
 src/backends/backendsCommon/WorkloadUtils.hpp       |  34 +++
 .../test/layerTests/Conv2dTestImpl.cpp              | 194 ++++++----
 .../workloads/ClDepthwiseConvolutionWorkload.cpp    |  32 ++-
 src/backends/neon/test/NeonLayerTests.cpp           |  16 +-
 .../workloads/NeonDepthwiseConvolutionWorkload.cpp  |  35 ++-
 src/backends/reference/test/CMakeLists.txt          |   2 +
 .../reference/test/RefPerAxisIteratorTests.cpp      | 252 +++++++++++++++++++++
 .../reference/test/RefPerChannelDecoderTests.cpp    | 156 +++++++++++++
 src/backends/reference/workloads/BaseIterator.hpp   | 180 +++++++--------
 src/backends/reference/workloads/ConvImpl.cpp       |  31 ++-
 src/backends/reference/workloads/Decoders.hpp       |  16 +-
 .../reference/workloads/TransposeConvolution2d.cpp  |   2 +-
 15 files changed, 781 insertions(+), 315 deletions(-)
 create mode 100644 src/backends/reference/test/RefPerAxisIteratorTests.cpp
 create mode 100644 src/backends/reference/test/RefPerChannelDecoderTests.cpp

diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index be0ac707a8..44a6a17b37 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -390,13 +390,6 @@ void ValidatePerAxisQuantizationDimension(const TensorInfo& tensorInfo,
         throw InvalidArgumentException(fmt::format("{0}: Quantization dimension for per-axis quantization "
                                                    "not set on tensor {1}.", descName, tensorName));
     }
-
-    if (quantizationDim.value() != 0)
-    {
-        throw InvalidArgumentException(fmt::format(
-            "{0}: Quantization dimension for per-axis quantization expected to be 0 on tensor {1}, "
-            "but got: {2}", descName, tensorName, quantizationDim.value()));
-    }
 }
 
 void ValidatePerAxisQuantizationOffset(const TensorInfo& tensorInfo,
@@ -1386,17 +1379,32 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa
     const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3;
 
-    // Expected weight shape: [ M, I, H, W ] - This shape does NOT depend on the data layout
+    // Expected weight shape: [ 1, H, W, I*M ] - This shape does NOT depend on the data layout
     // inputChannels * channelMultiplier should be equal to outputChannels.
-    const unsigned int numWeightChannelMultiplier = weightTensorInfo.GetShape()[0];
-    const unsigned int numWeightInputChannels = weightTensorInfo.GetShape()[1];
-    const unsigned int numWeightOutputChannels = outputTensorInfo.GetShape()[channelIndex];
-    if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels)
+    const unsigned int numWeightOutputChannels = weightTensorInfo.GetShape()[3]; // I*M = Cout
+    const unsigned int numOutputChannels = outputTensorInfo.GetShape()[channelIndex];
+    if (numWeightOutputChannels != numOutputChannels)
+    {
+        throw InvalidArgumentException(fmt::format(
+            "{0}: The weight format in armnn is expected to be [1, H, W, Cout], "
+            "but the 4th dimension is not equal to Cout. Cout = {1}. Provided weight shape: [{2}, {3}, {4}, {5}]",
+            descriptorName,
+            numOutputChannels,
+            weightTensorInfo.GetShape()[0],
+            weightTensorInfo.GetShape()[1],
+            weightTensorInfo.GetShape()[2],
+            weightTensorInfo.GetShape()[3]));
+    }
+    if (weightTensorInfo.GetShape()[0] != 1)
     {
         throw InvalidArgumentException(fmt::format(
-            "{0}: output_channels (provided {1}) should be equal to input_channels (provided {2}) "
-            "multiplied by channel_multiplier (provided {3}).",
-            descriptorName, numWeightOutputChannels, numWeightInputChannels, numWeightChannelMultiplier));
+            "{0}: The weight format in armnn is expected to be [1, H, W, Cout], "
+            "but the first dimension is not equal to 1. Provided weight shape: [{1}, {2}, {3}, {4}]",
+            descriptorName,
+            weightTensorInfo.GetShape()[0],
+            weightTensorInfo.GetShape()[1],
+            weightTensorInfo.GetShape()[2],
+            weightTensorInfo.GetShape()[3]));
     }
 
     ValidateWeightDataType(inputTensorInfo, weightTensorInfo, descriptorName);
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 77d4209657..11ce2cb44f 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -208,7 +208,19 @@ struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2dDescriptor>
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+/// Depthwise Convolution 2D layer workload data.
+///
+/// @note
+/// The weights are stored in the format [1, H, W, I*M], where I is the number of input channels and M the channel
+/// multiplier. If per channel quantization is applied, the weights are quantized along the last dimension/axis
+/// (I*M), which corresponds to the output channel.
+///
+/// @note
+/// If per channel quantization is applied, reshaping the weights tensor, e.g. [1, H, W, I*M] --> [H, W, I, M],
+/// won't work without taking care of the corresponding quantization scales.
+/// If there is no per channel quantization applied, reshaping the weights tensor won't cause any issues. There are
+/// preconfigured permutation functions available @link WorkloadUtils.hpp here.
+/// struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters { DepthwiseConvolution2dQueueDescriptor() diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp index c8105aea04..bd7f09b28a 100644 --- a/src/backends/backendsCommon/WorkloadUtils.cpp +++ b/src/backends/backendsCommon/WorkloadUtils.cpp @@ -7,6 +7,9 @@ #include #include +#include + +#include namespace armnn { @@ -107,6 +110,7 @@ ConstTensor ReorderWeightChannelsForAcl(const ConstTensor& weightHandle, DataLay return ConstTensor(weightHandle.GetInfo(), permuteBuffer); } + TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout) { // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either @@ -130,6 +134,96 @@ TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, D return weightPermutedInfo; } + +std::tuple Convert1HWOTensorToAcl(const ConstTensorHandle* weightTensor, + const TensorInfo& inputInfo, + const DataLayout dataLayout, + void* permuteBuffer) +{ + TensorInfo weightsInfo = weightTensor->GetTensorInfo(); + unsigned int depthMultiplier = 1; + PermutationVector permutationVector{}; + if (dataLayout == armnn::DataLayout::NHWC) + { + // No permutation required. Data layouts are the same. + + depthMultiplier = weightsInfo.GetShape()[3] / inputInfo.GetShape()[3]; + } + else if (dataLayout == armnn::DataLayout::NCHW) + { + // [ 1, H, W, I*M] --> [ 1, I * M, H, W ] + depthMultiplier = weightsInfo.GetShape()[3] / inputInfo.GetShape()[1]; + permutationVector = { 0, 2, 3, 1 }; + } + else + { + throw InvalidArgumentException(fmt::format("Unknown data layout for tensor conversion: {}", + GetDataLayoutName(dataLayout))); + } + + ConstTensor weightsPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer); + + return std::make_tuple(weightsPermuted, depthMultiplier); +} + +std::tuple Convert1HWOTensorInfoToAcl(const TensorInfo& weightInfo, + const TensorInfo& inputInfo, + const DataLayout dataLayout) +{ + unsigned int aclDepthMultiplier = 1; + TensorInfo weightsPermuted; + if (dataLayout == armnn::DataLayout::NHWC) + { + // No permutation required. Data layouts are the same. 
+        aclDepthMultiplier = weightInfo.GetShape()[3] / inputInfo.GetShape()[3];
+        weightsPermuted = weightInfo;
+    }
+    else if (dataLayout == armnn::DataLayout::NCHW)
+    {
+        // [ 1, H, W, I*M ] --> [ 1, I*M, H, W ]
+        aclDepthMultiplier = weightInfo.GetShape()[3] / inputInfo.GetShape()[1];
+        PermutationVector permutationVector{ 0, 2, 3, 1 };
+        weightsPermuted = armnnUtils::Permuted(weightInfo, permutationVector);
+    }
+    else
+    {
+        throw InvalidArgumentException(fmt::format("Unknown data layout for tensor info conversion: {}",
+                                                   GetDataLayoutName(dataLayout)));
+    }
+
+    return std::make_tuple(weightsPermuted, aclDepthMultiplier);
+}
+
+std::tuple<ConstTensor, unsigned int> Convert1HWOtoMIHW(const ConstTensorHandle* weightTensor,
+                                                        const TensorInfo& inputInfo,
+                                                        const DataLayout& dataLayout,
+                                                        void* permuteBuffer)
+{
+    TensorInfo weightsInfo = weightTensor->GetTensorInfo();
+
+    if (weightsInfo.HasPerAxisQuantization())
+    {
+        throw InvalidArgumentException("Can't convert tensor from [1,H,W,Cout] to [M,Cin,H,W] when per channel "
+                                       "quantization is applied.");
+    }
+
+    // Reshape weights [ 1, H, W, I*M ] --> [ H, W, I, M ]
+    auto weightsShape = weightsInfo.GetShape();
+    auto channelIndex = armnnUtils::DataLayoutIndexed(dataLayout).GetChannelsIndex();
+    unsigned int depthMultiplier = weightsShape[3] / inputInfo.GetShape()[channelIndex];
+    weightsInfo.SetShape({ weightsShape[1],
+                           weightsShape[2],
+                           inputInfo.GetShape()[channelIndex],
+                           depthMultiplier });
+
+    // Permute [ H, W, I, M ] --> [ M, I, H, W ]
+    PermutationVector permutationVector = { 2, 3, 1, 0 };
+    ConstTensor weightsPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer);
+
+    return std::make_tuple(weightsPermuted, depthMultiplier);
+}
+
 armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstTensorHandle* weightTensor,
                                                      DataLayout dataLayout,
                                                      void* permuteBuffer)
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 06d2eccf3e..d2f9ca5862 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -214,8 +214,42 @@ void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);
 
 TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);
 
+/// Weights for depthwise have a data layout of [1,H,W,O] = [1,H,W,I*M].
+/// This function converts a TensorInfo from [1,H,W,I*M] to [1,I*M,H,W] (if NCHW) or keeps it at [1,H,W,I*M]
+/// (if NHWC), as required by the compute library.
+/// Returns a tuple of the converted weights tensor info and the depth multiplier.
+std::tuple<TensorInfo, unsigned int> Convert1HWOTensorInfoToAcl(const TensorInfo& weightInfo,
+                                                                const TensorInfo& inputInfo,
+                                                                const DataLayout dataLayout);
+
 armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstTensorHandle* weightTensor,
                                                      DataLayout dataLayout,
                                                      void* permuteBuffer);
 
+/// Weights for depthwise have a data layout of [1,H,W,O] = [1,H,W,I*M].
+/// This function converts a ConstTensorHandle from [1,H,W,I*M] to [1,I*M,H,W] (if NCHW) or
+/// keeps it at [1,H,W,I*M] (if NHWC), as required by the compute library.
+///
+/// \param weightTensor - ConstTensorHandle of the weights tensor
+/// \param inputInfo - TensorInfo of the input tensor
+/// \param dataLayout - DataLayout of the input tensor
+/// \param permuteBuffer - Pointer to memory with the size of the tensor.
Used for the permutation +/// \return tuple of transformed weights-ConstTensor and depthwise multiplier +std::tuple Convert1HWOTensorToAcl(const ConstTensorHandle* weightTensor, + const TensorInfo& inputInfo, + const DataLayout dataLayout, + void* permuteBuffer); + +/// Converts a (weights) tensor from [1, H, W, I*M] = [1, H, W, O] to [M, I, H, W] +/// +/// \param weightTensor - ConstTensorHandle of the weight tensor that should be converted +/// \param inputInfo - TensorInfo of the corresponding input tensor +/// \param dataLayout - DataLayout of the input tensor e.g. NHWC or NCHW +/// \param permuteBuffer - Memory location with the same size as the weight tensor to write converted data to +/// \return - A tuple of ConstTensor and unsigned int which is the converted weightTensor and the depthMultiplier +std::tuple Convert1HWOtoMIHW(const ConstTensorHandle* weightTensor, + const TensorInfo& inputInfo, + const DataLayout& dataLayout, + void* permuteBuffer); + } //namespace armnn diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp index 98264ee928..99f1436c98 100644 --- a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp +++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp @@ -1659,10 +1659,9 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( unsigned int inputChannels = armnn::numeric_cast(inputShape[1]); unsigned int inputHeight = armnn::numeric_cast(inputShape[2]); unsigned int inputWidth = armnn::numeric_cast(inputShape[3]); - unsigned int kernelChanMul = armnn::numeric_cast(kernelShape[0]); - unsigned int kernelChannels = armnn::numeric_cast(kernelShape[1]); - unsigned int kernelHeight = armnn::numeric_cast(kernelShape[2]); - unsigned int kernelWidth = armnn::numeric_cast(kernelShape[3]); + unsigned int kernelHeight = armnn::numeric_cast(kernelShape[1]); + unsigned int kernelWidth = armnn::numeric_cast(kernelShape[2]); + unsigned int kernelChannels = armnn::numeric_cast(kernelShape[3]); unsigned int outputNum = armnn::numeric_cast(outputExpectedShape[0]); unsigned int outputChannels = armnn::numeric_cast(outputExpectedShape[1]); unsigned int outputHeight = armnn::numeric_cast(outputExpectedShape[2]); @@ -1677,7 +1676,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType); - armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, ArmnnType); + armnn::TensorInfo kernelDesc({1, kernelHeight, kernelWidth, kernelChannels}, ArmnnType); armnn::TensorInfo biasDesc({static_cast(bias.size())}, ArmnnBType); // Set quantization parameters if the requested type is a quantized type. 
@@ -1792,19 +1791,17 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl( unsigned int kernelHeight = 3; unsigned int kernelWidth = 3; - unsigned int kernelChannels = inputChannels; - unsigned int kernelDepthMultiplier = 1; unsigned int outputHeight = 1; unsigned int outputWidth = 1; - unsigned int outputChannels = kernelChannels; + unsigned int outputChannels = inputChannels; unsigned int outputNum = inputNum; armnn::TensorInfo inputTensorInfo = armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType); - armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth}, + armnn::TensorInfo kernelDesc({1, kernelHeight, kernelWidth, outputChannels}, ArmnnType); armnn::TensorInfo biasDesc({ outputChannels }, ArmnnBType); @@ -1955,7 +1952,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl( inputBatchSize, inputChannels, inputHeight, inputWidth, layout, ArmnnType); armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo( outputBatchSize, outputChannels, outputHeight, outputWidth, layout, ArmnnType); - armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth}, + armnn::TensorInfo kernelDesc({1, kernelHeight, kernelWidth, outputChannels}, ArmnnType); armnn::TensorInfo biasDesc({outputChannels}, ArmnnBType); @@ -2040,33 +2037,18 @@ LayerTestResult DepthwiseConvolution2dTestImpl( // Manually calculated. std::vector originalOutputImage = std::vector( QuantizedVector({ - 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, - 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, - 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, - 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, - 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, 6.5f, - 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, - - -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, - -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, - - 8.0f, 8.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 10.0f, 10.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 10.0f, 10.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 10.0f, 10.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 10.0f, 10.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 8.0f, 8.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 5, 5, 5, 5, 5, 5, 5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, + 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5, 5, 5, 5, 5, 5, 5, + 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, + 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 1, 3, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, + 2, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, + 2, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, + 2, 4, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, + 3, 5, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, + 3, 5, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0 }, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset())); @@ -2170,10 +2152,9 @@ LayerTestResult DepthwiseConvolution2dTestImpl( unsigned int outputChannels = 
armnn::numeric_cast(originalOutputExpectedShape[1]); unsigned int outputNum = armnn::numeric_cast(originalOutputExpectedShape[0]); - unsigned int kernelHeight = armnn::numeric_cast(originalKernelShape[2]); - unsigned int kernelWidth = armnn::numeric_cast(originalKernelShape[3]); - unsigned int kernelChannels = armnn::numeric_cast(originalKernelShape[1]); - unsigned int kernelDepthMul = armnn::numeric_cast(originalKernelShape[0]); + unsigned int kernelHeight = armnn::numeric_cast(originalKernelShape[1]); + unsigned int kernelWidth = armnn::numeric_cast(originalKernelShape[2]); + unsigned int kernelChannels = armnn::numeric_cast(originalKernelShape[3]); bool biasEnabled = bias.size() > 0; @@ -2192,7 +2173,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl( armnnUtils::GetTensorInfo(2*outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType); // Kernel must be NCHW layout always, independently of the layout of the input and output for depthwise convolution. - armnn::TensorInfo kernelDesc({kernelDepthMul, kernelChannels, kernelHeight, kernelWidth}, ArmnnType); + armnn::TensorInfo kernelDesc({1, kernelHeight, kernelWidth, kernelChannels}, ArmnnType); armnn::TensorInfo biasDesc({static_cast(bias.size())}, ArmnnBType); @@ -2332,9 +2313,9 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon( inputTensorInfo.GetQuantizationOffset()); // Use a depth multiplier of 1 on a 2-channel 4x4 kernel. - armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, ArmnnType); - auto kernel = QuantizedVector( - { + // Weights layout for depthwise: [1,H,W,I*M] + armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2 }, ArmnnType); + auto kernel = QuantizedVector({ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, @@ -2353,17 +2334,10 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon( armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5 }, ArmnnType); auto expectedOutput = QuantizedVector( { - 1062, 1580, 1850, 1530, 1117, - 2140, 3108, 3500, 2842, 2042, - 3580, 5068, 5460, 4342, 3062, - 3618, 5072, 5390, 4248, 2971, - 3074, 4282, 4510, 3533, 2457, - - 1550, 2284, 2362, 1955, 1428, - 2910, 4206, 4342, 3528, 2536, - 3390, 4886, 5022, 4068, 2916, - 3566, 5056, 5182, 4133, 2922, - 3100, 4352, 4452, 3517, 2465 + 396, 664, 820, 756, 602, 1016, 1608, 1880, 1652, 1268, 1976, 2968, 3240, 2732, + 2028, 2628, 3808, 4060, 3312, 2390, 2596, 3700, 3900, 3130, 2226, 2817, 4186, + 4330, 3609, 2651, 5414, 7864, 8120, 6626, 4780, 6314, 9144, 9400, 7646, 5500, + 6759, 9610, 9850, 7875, 5579, 5935, 8348, 8540, 6757, 4742 }, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset()); @@ -2420,9 +2394,8 @@ LayerTestResult DepthwiseConvolution2dNhwcTestCommon( inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset()); - armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, ArmnnType); - auto kernel = QuantizedVector( - { + armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2 }, ArmnnType); + auto kernel = QuantizedVector({ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, @@ -2439,17 +2412,17 @@ LayerTestResult DepthwiseConvolution2dNhwcTestCommon( armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5}, ArmnnType); auto expectedOutput = QuantizedVector( { - 1062, 1580, 1850, 1530, 1117, - 2140, 3108, 3500, 2842, 2042, - 3580, 5068, 5460, 4342, 3062, - 3618, 5072, 5390, 4248, 2971, - 3074, 4282, 4510, 3533, 2457, - - 1550, 2284, 2362, 1955, 1428, - 2910, 4206, 4342, 3528, 2536, - 3390, 4886, 5022, 4068, 2916, - 3566, 5056, 5182, 4133, 2922, - 3100, 4352, 4452, 3517, 2465 + 
396,664,820,756,602, + 1016,1608,1880,1652,1268, + 1976,2968,3240,2732,2028, + 2628,3808,4060,3312,2390, + 2596,3700,3900,3130,2226, + + 2817,4186,4330,3609,2651, + 5414,7864,8120,6626,4780, + 6314,9144,9400,7646,5500, + 6759,9610,9850,7875,5579, + 5935,8348,8540,6757,4742 }, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset()); @@ -2504,9 +2477,8 @@ LayerTestResult SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTestCommon( inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset()); - armnn::TensorInfo kernelTensorInfo({ 1, 1, 3, 3 }, ArmnnType); - auto kernel = QuantizedVector( - { + armnn::TensorInfo kernelTensorInfo({ 1, 3, 3, 1}, ArmnnType); + auto kernel = QuantizedVector({ 1, 2, 3, 4, 5, 6, 7, 8, 9 @@ -2671,7 +2643,7 @@ LayerTestResult DepthwiseConvolution2d3x3Dilation3x3Test( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - armnn::TensorInfo kernelTensorInfo({ 1, 1, 3, 3}, ArmnnType); + armnn::TensorInfo kernelTensorInfo({ 1, 3, 3, 1}, ArmnnType); std::vector kernelNoQuantizedValues = { 1, 2, 3, @@ -2740,7 +2712,7 @@ LayerTestResult DepthwiseConvolution2d2x3x3Dilation3x3Test( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - armnn::TensorInfo kernelTensorInfo({ 1, 2, 3, 3}, ArmnnType); + armnn::TensorInfo kernelTensorInfo({ 1, 3, 3, 2}, ArmnnType); std::vector kernelNoQuantizedValues = { 1, 2, 3, @@ -2757,15 +2729,9 @@ LayerTestResult DepthwiseConvolution2d2x3x3Dilation3x3Test( armnn::TensorInfo outputTensorInfo({ 1, 2, 4, 4}, ArmnnType); std::vector outputExpectedNoQuantizedValues = { - 6., 5., 5., 5., - 6., 5., 5., 5., - 6., 5., 5., 5., - 3., 2., 2., 2., + 2, 9, 9, 9, 2, 9, 9, 9, 2, 9, 9, 9, 5, 3, 3, 3, 3, - 6., 5., 5., 5., - 6., 5., 5., 5., - 6., 5., 5., 5., - 3., 2., 2., 2. + 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 6, 4, 4, 4 }; return DepthwiseConvolution2d3x3DilationTestCommon( @@ -2804,7 +2770,7 @@ LayerTestResult DepthwiseConvolution2dMult4Test( 27.0, 28.0, 29.0 }; - armnn::TensorInfo kernelTensorInfo({ 4, 2, 2, 2}, ArmnnType); + armnn::TensorInfo kernelTensorInfo({ 1, 2, 2, 8}, ArmnnType); std::vector kernelNoQuantizedValues = { @@ -2836,29 +2802,10 @@ LayerTestResult DepthwiseConvolution2dMult4Test( armnn::TensorInfo outputTensorInfo({ 1, 8, 2, 2}, ArmnnType); std::vector outputExpectedNoQuantizedValues = { - 10.f, 10.f, - 10.f, 10.f, - - 1.f, 1.f, - 1.f, 1.f, - - 2.f, 2.f, - 2.f, 2.f, - - 3.f, 3.f, - 3.f, 3.f, - - 23.f, 24.f, - 26.f, 27.f, - - 2.5f, 2.6000001f, - 2.8f, 2.9f, - - 4.2000003f, 4.4f, - 4.8f, 5.f, - - 6.6000004f, 6.9f, - 7.5000005f, 7.8f + 4.5f, 4.5f, 4.5f, 4.5f, 5.5f, 5.5f, 5.5f, 5.5f, + 2.5f, 2.5f, 2.5f, 2.5f, 3.5f, 3.5f, 3.5f, 3.5f, + 10.05f, 10.5f, 11.4f, 11.85f, 12.75f, 13.3f, 14.4f, 14.95f, + 5.25f, 5.5f, 6.0f, 6.25f, 7.45f, 7.8f, 8.5f, 8.85f }; @@ -2898,7 +2845,7 @@ LayerTestResult DepthwiseConvolution2dMult2Test( 27.0, 28.0, 29.0 }; - armnn::TensorInfo kernelTensorInfo({ 2, 2, 2, 2}, ArmnnType); + armnn::TensorInfo kernelTensorInfo({ 1, 2, 2, 4}, ArmnnType); std::vector kernelNoQuantizedValues = { @@ -2919,17 +2866,10 @@ LayerTestResult DepthwiseConvolution2dMult2Test( armnn::TensorInfo outputTensorInfo({ 1, 4, 2, 2}, ArmnnType); std::vector outputExpectedNoQuantizedValues = { - 10.f, 10.f, - 10.f, 10.f, - - 1.f, 1.f, - 1.f, 1.f, - - 4.2000003f, 4.4f, - 4.8f, 5.f, - - 6.6000004f, 6.9f, - 7.5000005f, 7.8f + 4.5f, 4.5f, 4.5f, 4.5f, + 5.5f, 5.5f, 5.5f, 5.5f, + 5.25f, 5.5f, 6.0f, 6.25f, + 7.65f, 8.0f, 8.7f, 9.05f }; @@ -2984,7 +2924,7 @@ LayerTestResult CompareDepthwiseConvolution2dTestImpl( std::vector inputShape; std::vector outputShape; - 
std::vector kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth }; + std::vector kernelShape{ 1, kernelHeight, kernelWidth, outputChannels }; std::vector biasShape{ outputChannels }; switch (layout.GetDataLayout()) { @@ -3609,6 +3549,14 @@ LayerTestResult DepthwiseConvolution2dDepthMul64Test( } armnn::TensorInfo kernelTensorInfo({ 64, 1, 2, 2 }, armnn::DataType::Float32); + // permute from [O,1,H,W] --> [1,H,W,O] + armnn::PermutationVector permutationVector {3,0,1,2}; + kernelTensorInfo = armnnUtils::Permuted(kernelTensorInfo, permutationVector); + std::vector kernelPermuted(kernelTensorInfo.GetNumElements()); + armnnUtils::Permute(kernelTensorInfo.GetShape(), permutationVector, + kernelData.data(), kernelPermuted.data(), + GetDataTypeSize(kernelTensorInfo.GetDataType())); + std::vector expectedOutputData(64, 0.f); armnn::TensorInfo outputTensorInfo({ 1, 64, 1, 1 }, armnn::DataType::Float32); @@ -3617,7 +3565,7 @@ LayerTestResult DepthwiseConvolution2dDepthMul64Test( memoryManager, tensorHandleFactory, input, - kernelData, + kernelPermuted, std::vector(), expectedOutputData, inputTensorInfo.GetShape(), @@ -3713,8 +3661,8 @@ LayerTestResult DepthwiseConvolution2dPerAxisQuantTest( TensorInfo outputInfo({ 1, 2, 2, 4 }, inputType, 1.0f, 128); // N H W C const std::vector quantScales{ 1.0f, 0.5f, 1.0f, 0.5f }; - const unsigned int quantDimension = 0; - TensorInfo kernelInfo({ 2, 2, 2, 2 }, kernelType, quantScales, quantDimension); // M I H W + const unsigned int quantDimension = 3; + TensorInfo kernelInfo({ 1, 2, 2, 4 }, kernelType, quantScales, quantDimension); // [1, H, W, I*M] const std::vector biasQuantScales{ 0.5f, 0.25f, 0.5f, 0.25f }; constexpr unsigned int biasQuantDimension = 0; diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp index 50cdb0a626..9a9977bd54 100644 --- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp +++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp @@ -33,12 +33,11 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); - // ArmNN's weight format is [ M, I, H, W ] - const unsigned int aclDepthMultiplier = weights.GetShape()[0]; - - // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either - // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library - TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout); + // ArmNN's weight format is usually [ M, I, H, W ] but for depthwise its [ 1, H, W, I*M] + // Permute to [ 1, I * M, H, W ] (if NCHW) as required by the compute library + unsigned int aclDepthMultiplier; + TensorInfo weightsPermuted; + std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input,descriptor.m_DataLayout); // Convert the weights into the compute library format const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout); @@ -79,14 +78,15 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( const arm_compute::CLCompileContext& clCompileContext) : BaseWorkload(descriptor, info) { - // Allocate a buffer for the swizzling of the weight tensor + // ArmNN's 
weight format is usually [ M, I, H, W ] but for depthwise its [ 1, H, W, I*M] + // Permute to [ 1, I * M, H, W ] (if NCHW), as required by the compute library + ConstTensor weightPermuted; + unsigned int depthMultiplier; std::unique_ptr permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]); - - // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either - // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library - ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight, - m_Data.m_Parameters.m_DataLayout, - permuteBuffer.get()); + std::tie(weightPermuted, depthMultiplier) = Convert1HWOTensorToAcl(m_Data.m_Weight, + info.m_InputTensorInfos[0], + m_Data.m_Parameters.m_DataLayout, + permuteBuffer.get()); // Convert the weights into the compute library format m_KernelTensor = std::make_unique(); @@ -113,12 +113,6 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); - // ArmNN's weight format is [ M, I, H, W ] - auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - - // Get the depth multiplier - const unsigned int depthMultiplier = weightInfo.GetShape()[0]; - arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters); const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index edc8cb995c..62864f82dc 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -216,6 +216,11 @@ ARMNN_AUTO_TEST_CASE(DepthToSpaceNhwcInt16_3, DepthToSpaceTest3, DataLayout::NHWC); // Depthwise Convolution +ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2d, DepthwiseConvolution2dTest, true, DataLayout::NCHW) +ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dUint8, DepthwiseConvolution2dUint8Test, true, DataLayout::NCHW) + +ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2d, DepthwiseConvolution2dTest, false, DataLayout::NCHW) + ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dDepthMul1, DepthwiseConvolution2dDepthMul1Test, true, DataLayout::NCHW) ARMNN_AUTO_TEST_CASE_WITH_THF(UnbiasedDepthwiseConvolution2dDepthMul1, @@ -291,16 +296,15 @@ TensorInfo CreateOutputTensorInfo(const TensorInfo& inputInfo, unsigned int inHeight = inputShape[2]; unsigned int inBatchSize = inputShape[0]; - unsigned int filterWidth = filterShape[3]; + unsigned int filterWidth = filterShape[2]; unsigned int readWidth = (inWidth + descriptor.m_PadLeft + descriptor.m_PadRight) - (filterWidth); unsigned int outWidth = 1u + (readWidth / descriptor.m_StrideX); - unsigned int filterHeight = filterShape[2]; + unsigned int filterHeight = filterShape[1]; unsigned int readHeight = (inHeight + descriptor.m_PadTop + descriptor.m_PadBottom) - (filterHeight); unsigned int outHeight = 1u + (readHeight / descriptor.m_StrideY); - unsigned int depthMultiplier = filterShape[0]; - unsigned int outChannels = filterShape[1] * depthMultiplier; + unsigned int outChannels = filterShape[3]; unsigned int outBatchSize = inBatchSize; TensorShape outputShape({outBatchSize, outChannels, outHeight, outWidth}); @@ -314,7 +318,7 @@ TEST_CASE("DepthwiseConv2dUtils") TensorInfo inputInfo({1, 1, 10, 10 }, dataType); TensorInfo outputInfo; - TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, 
dataType); + TensorInfo weightsInfo3x3({ 1, 3, 3, 1 }, dataType); // [1,H,W,I*M] TensorInfo biasesInfo; DepthwiseConvolution2dDescriptor descriptor; @@ -380,7 +384,7 @@ TEST_CASE("DepthwiseConv2dUtils") weightsInfo1x1, biasesInfo)); // Supported shape 2x2 - TensorInfo weightsInfo2x2({ 1, 1, 2, 2 }, DataType::Float32); + TensorInfo weightsInfo2x2({ 1, 2, 2, 1 }, DataType::Float32); descriptor = MakeDepthwiseConv2dDesc(1, 1); outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo2x2, descriptor, dataType); CHECK(layerSupport.IsDepthwiseConvolutionSupported(inputInfo, outputInfo, descriptor, diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp index ad509076b4..589a951825 100644 --- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp +++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp @@ -36,12 +36,11 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); - // ArmNN's weight format is [ M, I, H, W ] - const unsigned int aclDepthMultiplier = weights.GetShape()[0]; - - // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either - // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library - TensorInfo weightsPermuted = ConvertWeightTensorInfoFromArmnnToAcl(weights, descriptor.m_DataLayout); + // ArmNN's weight format is usually [ M, I, H, W ] but for depthwise its [ 1, H, W, I*M] + // Permute to [ 1, I * M, H, W ] (if NCHW), as required by the compute library + unsigned int aclDepthMultiplier; + TensorInfo weightsPermuted; + std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input,descriptor.m_DataLayout); // Convert the weights into the compute library format const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout); @@ -79,21 +78,20 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( const WorkloadInfo& info) : BaseWorkload(descriptor, info) { - // ArmNN's weight format is [ M, I, H, W ] + // ArmNN's weight format for depthwise is [ 1, H, W, I*M ] auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - // Allocate a buffer for the swizzling of the weight tensor - std::unique_ptr permuteBuffer(new unsigned char[m_Data.m_Weight->GetTensorInfo().GetNumBytes()]); - - // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either - // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library - ConstTensor weightPermuted = ConvertWeightTensorFromArmnnToAcl(m_Data.m_Weight, - m_Data.m_Parameters.m_DataLayout, - permuteBuffer.get()); + ConstTensor weightsPermuted; + unsigned int depthMultiplier; + std::unique_ptr permuteBuffer(new unsigned char[weightInfo.GetNumBytes()]); + std::tie(weightsPermuted, depthMultiplier) = Convert1HWOTensorToAcl(m_Data.m_Weight, + info.m_InputTensorInfos[0], + m_Data.m_Parameters.m_DataLayout, + permuteBuffer.get()); // Convert the weights into the compute library format m_KernelTensor = std::make_unique(); - BuildArmComputeTensor(*m_KernelTensor, weightPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout); + 
BuildArmComputeTensor(*m_KernelTensor, weightsPermuted.GetInfo(), m_Data.m_Parameters.m_DataLayout); if (m_Data.m_Parameters.m_BiasEnabled) { @@ -116,9 +114,6 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); - // Get the depth multiplier - const unsigned int depthMultiplier = weightInfo.GetShape()[0]; - arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters); const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); @@ -136,7 +131,7 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( ARMNN_ASSERT(m_pDepthwiseConvolutionLayer); - ScopedTensorHandle weightsPermutedHandle(weightPermuted); + ScopedTensorHandle weightsPermutedHandle(weightsPermuted); InitializeArmComputeTensorData(*m_KernelTensor, &weightsPermutedHandle); if (m_Data.m_Parameters.m_BiasEnabled) diff --git a/src/backends/reference/test/CMakeLists.txt b/src/backends/reference/test/CMakeLists.txt index 76541cfdaa..d7c5da896a 100644 --- a/src/backends/reference/test/CMakeLists.txt +++ b/src/backends/reference/test/CMakeLists.txt @@ -13,6 +13,8 @@ list(APPEND armnnRefBackendUnitTests_sources RefLayerTests.cpp RefMemoryManagerTests.cpp RefOptimizedNetworkTests.cpp + RefPerAxisIteratorTests.cpp + RefPerChannelDecoderTests.cpp RefRuntimeTests.cpp RefTensorHandleTests.cpp RefWorkloadFactoryHelper.hpp diff --git a/src/backends/reference/test/RefPerAxisIteratorTests.cpp b/src/backends/reference/test/RefPerAxisIteratorTests.cpp new file mode 100644 index 0000000000..7da4c0fb0f --- /dev/null +++ b/src/backends/reference/test/RefPerAxisIteratorTests.cpp @@ -0,0 +1,252 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include + +#include +#include + + +template +void CompareVector(std::vector vec1, std::vector vec2) +{ + BOOST_TEST(vec1.size() == vec2.size()); + + bool mismatch = false; + for (uint i = 0; i < vec1.size(); ++i) + { + if (vec1[i] != vec2[i]) + { + /*std::stringstream ss; + ss << "Vector value mismatch: index=" << i << " " << vec1[i] << "!=" << vec2[i];*/ + BOOST_TEST_MESSAGE(fmt::format("Vector value mismatch: index={} {} != {}", + i, + vec1[i], + vec2[i])); + mismatch = true; + } + } + + if (mismatch) + { + BOOST_FAIL("Error in CompareVector. 
Vectors don't match."); + } +} + +using namespace armnn; + +// Basically a per axis decoder but without any decoding/quantization +class MockPerAxisIterator : public PerAxisIterator> +{ +public: + MockPerAxisIterator(const int8_t* data, const armnn::TensorShape& tensorShape, const unsigned int axis) + : PerAxisIterator(data, tensorShape, axis), m_NumElements(tensorShape.GetNumElements()) + {} + + int8_t Get() const override + { + return *m_Iterator; + } + + virtual std::vector DecodeTensor(const TensorShape &tensorShape, + bool isDepthwise = false) override + { + IgnoreUnused(tensorShape, isDepthwise); + return std::vector{}; + }; + + // Iterates over data using operator[] and returns vector + std::vector Loop() + { + std::vector vec; + for (uint32_t i = 0; i < m_NumElements; ++i) + { + this->operator[](i); + vec.emplace_back(Get()); + } + return vec; + } + + unsigned int GetAxisIndex() + { + return m_AxisIndex; + } + unsigned int m_NumElements; +}; + +BOOST_AUTO_TEST_SUITE(RefPerAxisIterator) + +// Test Loop (Equivalent to DecodeTensor) and Axis = 0 +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest1) +{ + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8); + + // test axis=0 + std::vector expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 0); + std::vector output = iterator.Loop(); + CompareVector(output, expOutput); + + // Set iterator to index and check if the axis index is correct + iterator[5]; + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator[1]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[10]; + BOOST_TEST(iterator.GetAxisIndex() == 2u); +} + +// Test Axis = 1 +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest2) +{ + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8); + + // test axis=1 + std::vector expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 1); + std::vector output = iterator.Loop(); + CompareVector(output, expOutput); + + // Set iterator to index and check if the axis index is correct + iterator[5]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[1]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[10]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); +} + +// Test Axis = 2 +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest3) +{ + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8); + + // test axis=2 + std::vector expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 2); + std::vector output = iterator.Loop(); + CompareVector(output, expOutput); + + // Set iterator to index and check if the axis index is correct + iterator[5]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[1]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[10]; + BOOST_TEST(iterator.GetAxisIndex() == 1u); +} + +// Test Axis = 3 +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest4) +{ + std::vector input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8); + + // test axis=3 + std::vector expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 3); + std::vector output = iterator.Loop(); + CompareVector(output, expOutput); + + // Set iterator to index and check if 
the axis index is correct + iterator[5]; + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator[1]; + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator[10]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); +} + + +// Test Axis = 1. Different tensor shape +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest5) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15 + }; + + std::vector expOutput = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15 + }; + + TensorInfo tensorInfo ({2,2,2,2},DataType::QSymmS8); + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 1); + std::vector output = iterator.Loop(); + CompareVector(output, expOutput); + + // Set iterator to index and check if the axis index is correct + iterator[5]; + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator[1]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator[10]; + BOOST_TEST(iterator.GetAxisIndex() == 0u); +} + +// Test the increment and decrement operator +BOOST_AUTO_TEST_CASE(PerAxisIteratorTest7) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11 + }; + + std::vector expOutput = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11 + }; + + TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8); + auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 2); + + iterator += 3; + BOOST_TEST(iterator.Get(), expOutput[3]); + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator += 3; + BOOST_TEST(iterator.Get(), expOutput[6]); + BOOST_TEST(iterator.GetAxisIndex() == 1u); + + iterator -= 2; + BOOST_TEST(iterator.Get(), expOutput[4]); + BOOST_TEST(iterator.GetAxisIndex() == 0u); + + iterator -= 1; + BOOST_TEST(iterator.Get(), expOutput[3]); + BOOST_TEST(iterator.GetAxisIndex() == 1u); +} + + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/src/backends/reference/test/RefPerChannelDecoderTests.cpp b/src/backends/reference/test/RefPerChannelDecoderTests.cpp new file mode 100644 index 0000000000..c2e3cee7a0 --- /dev/null +++ b/src/backends/reference/test/RefPerChannelDecoderTests.cpp @@ -0,0 +1,156 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include + +#include + +BOOST_AUTO_TEST_SUITE(RefPerChannelDecoder) + +template +void CompareVector(std::vector vec1, std::vector vec2) +{ + BOOST_TEST(vec1.size() == vec2.size()); + + bool mismatch = false; + for (uint i = 0; i < vec1.size(); ++i) + { + if (vec1[i] != vec2[i]) + { + /*std::stringstream ss; + ss << "Vector value mismatch: index=" << i << " " << vec1[i] << "!=" << vec2[i];*/ + BOOST_TEST_MESSAGE(fmt::format("Vector value mismatch: index={} {} != {}", + i, + vec1[i], + vec2[i])); + mismatch = true; + } + } + + if (mismatch) + { + BOOST_FAIL("Error in CompareVector. 
Vectors don't match."); + } +} + +// Ensure quantization works for none depthwise convolutions +BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest1) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 + }; + + std::vector expOutput = + { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, + 24.0f, 26.0f, 28.0f, 30.0f, 32.0f, 34.0f, 36.0f, 38.0f, 40.0f, 42.0f, 44.0f, 46.0f + }; + + TensorInfo tensorInfo ({2,2,2,3},DataType::QSymmS8,{1.0f, 2.0f},0); + auto decoder = MakeDecoder(tensorInfo, input.data()); + + std::vector output = decoder->DecodeTensor(tensorInfo.GetShape()); + + CompareVector(output, expOutput); +} + +// Ensure quantization works for depthwise convolutions M=1 +BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest2) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + + std::vector expOutput = + { + 0.0f, 1.0f, 2.0f, 3.0f, + 8.0f, 10.0f, 12.0f, 14.0f, + 24.0f, 27.0f, 30.0f, 33.0f, + 48.0f, 52.0f, 56.0f, 60.0f + }; + + // [O,1,H,W] = [I*M,1,H,W] = [4*1,1,2,2] + TensorInfo tensorInfo ({4,1,2,2},DataType::QSymmS8,{1.0f, 2.0f, 3.0f, 4.0f},0); + auto decoder = MakeDecoder(tensorInfo, input.data()); + + std::vector output = decoder->DecodeTensor(tensorInfo.GetShape(), true); + + CompareVector(output, expOutput); +} + +// Ensure quantization works for depthwise convolutions M=2 +BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest3) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23 + }; + + std::vector expOutput = + { + 0.0f, 1.0f, 2.0f, 3.0f, + 8.0f, 10.0f, 12.0f, 14.0f, + 24.0f, 27.0f, 30.0f, 33.0f, + 48.0f, 52.0f, 56.0f, 60.0f, + 80.0f, 85.0f, 90.0f, 95.0f, + 120.0f, 126.0f, 132.0f, 138.0f + }; + + // [O,1,H,W] = [I*M,1,H,W] = [3*2,1,2,2] + TensorInfo tensorInfo ({6,1,2,2},DataType::QSymmS8,{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},0); + auto decoder = MakeDecoder(tensorInfo, input.data()); + + std::vector output = decoder->DecodeTensor(tensorInfo.GetShape(), true); + + CompareVector(output, expOutput); +} + +// Ensure quantization works for depthwise convolutions M=2 for int32 +BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest4) +{ + using namespace armnn; + std::vector input = + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23 + }; + + std::vector expOutput = + { + 0.0f, 1.0f, 2.0f, 3.0f, + 8.0f, 10.0f, 12.0f, 14.0f, + 24.0f, 27.0f, 30.0f, 33.0f, + 48.0f, 52.0f, 56.0f, 60.0f, + 80.0f, 85.0f, 90.0f, 95.0f, + 120.0f, 126.0f, 132.0f, 138.0f + }; + + // [O,1,H,W] = [I*M,1,H,W] = [3*2,1,2,2] + TensorInfo tensorInfo ({6,1,2,2},DataType::Signed32,{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},0); + auto decoder = MakeDecoder(tensorInfo, input.data()); + + std::vector output = decoder->DecodeTensor(tensorInfo.GetShape(), true); + + CompareVector(output, expOutput); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/backends/reference/workloads/BaseIterator.hpp b/src/backends/reference/workloads/BaseIterator.hpp index 73e24691d9..483ef720f9 100644 --- a/src/backends/reference/workloads/BaseIterator.hpp +++ b/src/backends/reference/workloads/BaseIterator.hpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include @@ -22,8 +24,6 @@ public: virtual ~BaseIterator() {} - virtual BaseIterator& SetIndex(unsigned int index, unsigned int axisIndex = 0) = 0; - virtual BaseIterator& 
operator++() = 0; virtual BaseIterator& operator+=(const unsigned int increment) = 0; @@ -47,7 +47,6 @@ public: virtual std::vector DecodeTensor(const TensorShape &tensorShape, - const unsigned int channelMultiplier = 1, bool isDepthwise = false) = 0; }; @@ -108,14 +107,6 @@ public: return *this; } - TypedIterator& SetIndex(unsigned int index, unsigned int axisIndex = 0) override - { - IgnoreUnused(axisIndex); - ARMNN_ASSERT(m_Iterator); - m_Iterator = m_Start + index; - return *this; - } - protected: T* m_Iterator; T* m_Start; @@ -135,10 +126,9 @@ public: return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset); } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -173,10 +163,9 @@ public: return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset); } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -211,10 +200,9 @@ public: return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset); } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -249,10 +237,9 @@ public: return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset); } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -289,10 +276,9 @@ public: return val; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -328,10 +314,9 @@ public: return val; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -365,10 +350,9 @@ public: return *m_Iterator; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -393,10 +377,9 @@ public: return static_cast(*m_Iterator) * m_Scale; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -430,10 +413,9 @@ public: return static_cast(*m_Iterator); } std::vector DecodeTensor (const TensorShape& 
tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -463,10 +445,9 @@ public: return *m_Iterator; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -496,10 +477,9 @@ public: return *m_Iterator; } std::vector DecodeTensor (const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -530,10 +510,9 @@ public: } std::vector DecodeTensor(const TensorShape& tensorShape, - const unsigned int channelMultiplier, const bool isDepthwise) override { - IgnoreUnused(channelMultiplier, isDepthwise); + IgnoreUnused(isDepthwise); const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; @@ -769,23 +748,33 @@ public: } }; -// PerAxisIterator for per-axis quantization +/// PerAxisIterator for per-axis quantization. Iterates over a tensor as layed out in memory and keeps track +/// of the axis index. template class PerAxisIterator : public Base { public: - // axisFactor is used to calculate channelStep - PerAxisIterator(T* data = nullptr, unsigned int axisFactor = 0) - : m_Iterator(data), m_Start(data), m_AxisIndex(0), m_AxisFactor(axisFactor) + PerAxisIterator(T* data = nullptr, + unsigned int axisFactor = 0, + unsigned int axisDimensionality=0) + : m_Iterator(data), + m_Start(data), + m_AxisIndex(0), // iterates over the dimension of axis + m_AxisDimensionality(axisDimensionality), // tensorShape[quantization_dim] + m_AxisFactor(axisFactor), + m_Index(0) {} - // This should be called to set index for per-axis Encoder/Decoder - PerAxisIterator& SetIndex(unsigned int index, unsigned int axisIndex) override + PerAxisIterator(T* data = nullptr, + const armnn::TensorShape& tensorShape = TensorShape(), + const unsigned int axis = 0) + : m_Iterator(data), + m_Start(data), + m_AxisIndex(0), + m_Index(0) { - ARMNN_ASSERT(m_Iterator); - m_Iterator = m_Start + index; - m_AxisIndex = axisIndex; - return *this; + m_AxisDimensionality = tensorShape[axis]; + m_AxisFactor = armnnUtils::GetNumElementsAfter(tensorShape, axis); } void Reset(void* data) override @@ -793,37 +782,50 @@ public: m_Iterator = reinterpret_cast(data); m_Start = m_Iterator; m_AxisIndex = 0; + m_Index = 0; } PerAxisIterator& operator++() override { - ARMNN_ASSERT(m_Iterator); - ++m_Iterator; - m_AxisIndex = static_cast(*m_Iterator) % m_AxisFactor; + ++m_Index; + this -> operator[](m_Index); return *this; } PerAxisIterator& operator+=(const unsigned int increment) override { - ARMNN_ASSERT(m_Iterator); - m_Iterator += increment; - m_AxisIndex = static_cast(*m_Iterator) % m_AxisFactor; + m_Index += increment; + this -> operator[](m_Index); return *this; } PerAxisIterator& operator-=(const unsigned int decrement) override { - ARMNN_ASSERT(m_Iterator); - m_Iterator -= decrement; - m_AxisIndex = static_cast(*m_Iterator) % m_AxisFactor; + m_Index -= decrement; + this -> operator[](m_Index); return *this; } - PerAxisIterator& operator[](const unsigned int index) override + + inline 
PerAxisIterator& SetIndexOnMem(const unsigned int index) { ARMNN_ASSERT(m_Iterator); m_Iterator = m_Start + index; - m_AxisIndex = static_cast(*m_Iterator) % m_AxisFactor; + if (index < m_AxisFactor) + { + m_AxisIndex = 0; + } + else + { + m_AxisIndex = (index / m_AxisFactor) % m_AxisDimensionality; + } + m_Index = index; + return *this; + } + + PerAxisIterator& operator[](const unsigned int index) override + { + SetIndexOnMem(index); return *this; } @@ -831,18 +833,22 @@ public: T* m_Iterator; T* m_Start; unsigned int m_AxisIndex; + unsigned int m_AxisDimensionality; // tensorShape[quantization_dim] unsigned int m_AxisFactor; + unsigned int m_Index; }; class QSymm8PerAxisDecoder : public PerAxisIterator> { public: - QSymm8PerAxisDecoder(const int8_t* data, const std::vector& scale, unsigned int axisFactor) - : PerAxisIterator(data, axisFactor), m_Scales(scale) {} + QSymm8PerAxisDecoder(const int8_t* data, const armnn::TensorInfo& tensorInfo) + : PerAxisIterator(data, tensorInfo.GetShape(), tensorInfo.GetQuantizationDim().value()), + m_Scales(tensorInfo.GetQuantizationScales()) + {} float Get() const override { - return armnn::Dequantize(*m_Iterator, m_Scales[m_AxisIndex], 0); + return armnn::Dequantize(*m_Iterator, GetScale(), 0); } // Get scale of the current value @@ -852,37 +858,18 @@ public: } std::vector DecodeTensor(const TensorShape &tensorShape, - const unsigned int channelMultiplier, bool isDepthwise) override { - const uint32_t size = tensorShape.GetNumElements(); - const uint32_t scaleSize = static_cast(m_Scales.size()); - - const uint32_t stepSize = isDepthwise ? - tensorShape[2] * tensorShape[3] : tensorShape.GetNumElements() / tensorShape[0]; - - const uint32_t stepNum = size / (stepSize * channelMultiplier); - uint32_t scale; + IgnoreUnused(isDepthwise); + const unsigned int size = tensorShape.GetNumElements(); std::vector decodedTensor; decodedTensor.reserve(size); - // channelMultiplier is only used in depthwise convolutions and in other cases will have no effect - // stepSize is the length of a contiguous area sharing a quantization scale within a tensor - // stepNum is the number of those steps/blocks in the tensor - for (uint32_t mult = 0; mult < channelMultiplier; ++mult) + for (uint32_t i = 0; i < size; ++i) { - for (uint32_t step = 0; step < stepNum; ++step) - { - scale = (channelMultiplier * step + mult) % scaleSize; - for (uint32_t i = 0; i < stepSize; ++i) - { - unsigned int index = mult * stepSize * channelMultiplier + - step * stepSize + i; - this->operator[](index); - decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0)); - } - } + SetIndexOnMem(i); + decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, GetScale(), 0)); } return decodedTensor; } @@ -920,8 +907,10 @@ private: class ScaledInt32PerAxisDecoder : public PerAxisIterator> { public: - ScaledInt32PerAxisDecoder(const int32_t* data, const std::vector& scales, unsigned int axisFactor) - : PerAxisIterator(data, axisFactor), m_Scales(scales) {} + ScaledInt32PerAxisDecoder(const int32_t* data, const armnn::TensorInfo tensorInfo) + : PerAxisIterator(data, tensorInfo.GetShape(), tensorInfo.GetQuantizationDim().value()), + m_Scales(tensorInfo.GetQuantizationScales()) + {} float Get() const override { @@ -935,17 +924,14 @@ public: } std::vector DecodeTensor(const TensorShape &tensorShape, - const unsigned int channelMultiplier, bool isDepthwise) override { const uint32_t size = tensorShape.GetNumElements(); - const uint32_t scaleSize = static_cast(m_Scales.size()); const uint32_t 
            stepSize = isDepthwise ? tensorShape[2] * tensorShape[3] : tensorShape.GetNumElements() / tensorShape[0];
 
-        const uint32_t stepNum = size / (stepSize * channelMultiplier);
-        uint32_t scale;
+        const uint32_t stepNum = size / stepSize;
 
         std::vector<float> decodedTensor;
         decodedTensor.reserve(size);
@@ -953,18 +939,14 @@ public:
         // channelMultiplier is only used in depthwise convolutions and in other cases will have no effect
         // stepSize is the length of a contiguous area sharing a quantization scale within a tensor
         // stepNum is the number of those steps/blocks in the tensor
-        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        for (uint32_t step = 0; step < stepNum; ++step)
         {
-            for (uint32_t step = 0; step < stepNum; ++step)
+            //scale = (channelMultiplier * step + mult) % scaleSize;
+            for (uint32_t i = 0; i < stepSize; ++i)
             {
-                scale = (channelMultiplier * step + mult) % scaleSize;
-                for (uint32_t i = 0; i < stepSize; ++i)
-                {
-                    unsigned int index = mult * stepSize * channelMultiplier +
-                                         step * stepSize + i;
-                    this->operator[](index);
-                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
-                }
+                unsigned int index = step * stepSize + i;
+                this->operator[](index);
+                decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[step], 0));
             }
         }
         return decodedTensor;
diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp
index d7845535df..e1bbc6bc52 100644
--- a/src/backends/reference/workloads/ConvImpl.cpp
+++ b/src/backends/reference/workloads/ConvImpl.cpp
@@ -95,9 +95,12 @@ void Convolve(const TensorShape& rInputShape,
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    const unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
-    const unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
-    const unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
+    // Weights layout:
+    // Conv2d:    [O,H,W,I]
+    // Depthwise: [1,H,W,O]
+    const unsigned int inputChannels   = rInputShape[channelsIndex];
+    const unsigned int outputChannels  = rOutputShape[channelsIndex];
+    const unsigned int depthMultiplier = depthwise ? outputChannels / inputChannels : 1;
 
     const unsigned int batchSize    = rOutputShape[0];
     const unsigned int outputHeight = rOutputShape[heightIndex];
@@ -105,16 +108,15 @@ void Convolve(const TensorShape& rInputShape,
     const unsigned int inputHeight  = rInputShape[heightIndex];
     const unsigned int inputWidth   = rInputShape[widthIndex];
 
-    const unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
-    const unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+    const unsigned int filterHeight = depthwise ? rFilterShape[1] : rFilterShape[heightIndex];
+    const unsigned int filterWidth  = depthwise ? rFilterShape[2] : rFilterShape[widthIndex];
 
     const std::vector<float> inputVec  = rInputDecoder.DecodeTensor(rInputShape);
-    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape, depthMultiplier, depthwise);
+    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape, depthwise);
 
     const TensorShape biasShape{outputChannels};
     const std::vector<float> biasVec = biasEnabled ?
        pBiasDecoder->DecodeTensor(biasShape) : std::vector<float>();
 
-    unsigned int depthwiseMultiplierIdx = 0;
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
@@ -130,13 +132,6 @@ void Convolve(const TensorShape& rInputShape,
                 // For normal, must loop over each input channel.
                 for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                 {
-                    if (depthwise)
-                    {
-                        depthwiseMultiplierIdx = 0;
-                        cInput = cOutput / depthMultiplier;
-                        depthwiseMultiplierIdx = cOutput % depthMultiplier;
-                    }
-
                     for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                     {
                         for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
@@ -147,10 +142,10 @@ void Convolve(const TensorShape& rInputShape,
                             // Since dimensionality of kernel depends on depthwiseness, so does index.
                             if (depthwise)
                             {
-                                filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
-                                              cInput * filterWidth * filterHeight +
-                                              yFilter * filterWidth +
-                                              xFilter;
+                                cInput = cOutput / depthMultiplier;
+                                // filterDepth = outputChannels;
+                                filterIndex = xFilter * outputChannels + cOutput +
+                                              yFilter * filterWidth * outputChannels;
                             }
                             else
                             {
diff --git a/src/backends/reference/workloads/Decoders.hpp b/src/backends/reference/workloads/Decoders.hpp
index 0b3f36047d..cd0dc5d40f 100644
--- a/src/backends/reference/workloads/Decoders.hpp
+++ b/src/backends/reference/workloads/Decoders.hpp
@@ -20,11 +20,7 @@ namespace
 inline std::unique_ptr<Decoder<float>> MakeSigned32PerAxisDecoder(const TensorInfo& info, const void* data)
 {
-    auto params = armnnUtils::GetPerAxisParams(info);
-    return std::make_unique<ScaledInt32PerAxisDecoder>(
-        static_cast<const int32_t*>(data),
-        params.second,
-        params.first);
+    return std::make_unique<ScaledInt32PerAxisDecoder>(static_cast<const int32_t*>(data), info);
 }
 
 inline std::unique_ptr<Decoder<float>> MakeSigned32Decoder(const TensorInfo& info, const void* data)
@@ -75,10 +71,7 @@ inline std::unique_ptr<Decoder<float>> MakeDecoder(const TensorInfo& info, const
     case armnn::DataType::QuantizedSymm8PerAxis:
     {
         std::pair<unsigned int, std::vector<float>> params = armnnUtils::GetPerAxisParams(info);
-        return std::make_unique<QSymm8PerAxisDecoder>(
-            static_cast<const int8_t*>(data),
-            params.second,
-            params.first);
+        return std::make_unique<QSymm8PerAxisDecoder>(static_cast<const int8_t*>(data), info);
     }
     ARMNN_NO_DEPRECATE_WARN_END
     case DataType::QAsymmS8:
@@ -123,10 +116,7 @@ inline std::unique_ptr<Decoder<float>> MakeDecoder(const TensorInfo& info, const
         if (info.HasPerAxisQuantization())
         {
             std::pair<unsigned int, std::vector<float>> params = armnnUtils::GetPerAxisParams(info);
-            return std::make_unique<QSymm8PerAxisDecoder>(
-                static_cast<const int8_t*>(data),
-                params.second,
-                params.first);
+            return std::make_unique<QSymm8PerAxisDecoder>(static_cast<const int8_t*>(data), info);
         }
         else
         {
diff --git a/src/backends/reference/workloads/TransposeConvolution2d.cpp b/src/backends/reference/workloads/TransposeConvolution2d.cpp
index 7408e92982..a1a6cbae68 100644
--- a/src/backends/reference/workloads/TransposeConvolution2d.cpp
+++ b/src/backends/reference/workloads/TransposeConvolution2d.cpp
@@ -137,7 +137,7 @@ void TransposeConvolution2dImpl(const TransposeConvolution2dDescriptor& descript
     {
         for (unsigned int dOutput = 0u; dOutput < outputDepth; ++dOutput)
         {
-            rBiasesDecoder.SetIndex(dOutput, dOutput);
+            rBiasesDecoder[dOutput];
             for (unsigned int yOutput = 0u; yOutput < outputHeight; ++yOutput)
             {
                 for (unsigned int xOutput = 0u; xOutput < outputWidth; ++xOutput)
-- 
cgit v1.2.1
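
A note on the per-axis bookkeeping introduced in BaseIterator.hpp above: the reworked
PerAxisIterator walks the tensor exactly as it is laid out in memory, keeps a flat element
index, and derives the quantization-axis index as (index / axisFactor) % axisDimensionality,
where axisFactor is the number of elements after the quantization axis and axisDimensionality
is tensorShape[quantization_dim]. The standalone sketch below (illustrative names only, not
the armnn class) shows the arithmetic for a hypothetical [1,2,2,4] depthwise weight tensor
quantized along dimension 3: that axis is innermost, so axisFactor is 1 and the axis index
simply cycles through the I*M output channels.

    #include <cstdio>

    // Illustrative sketch only (not the armnn class): derive the quantization-axis
    // index from a flat memory index.
    //   axisFactor         = number of elements after the quantization axis
    //   axisDimensionality = tensorShape[quantization_dim]
    static unsigned int AxisIndex(unsigned int flatIndex,
                                  unsigned int axisFactor,
                                  unsigned int axisDimensionality)
    {
        return (flatIndex / axisFactor) % axisDimensionality;
    }

    int main()
    {
        // Hypothetical [1,2,2,4] depthwise weights quantized along dim 3 (I*M == 4):
        // axisFactor == 1, so the axis index cycles 0,1,2,3,0,1,2,3,... over memory.
        for (unsigned int i = 0; i < 16; ++i)
        {
            std::printf("flat %2u -> axis %u\n", i, AxisIndex(i, 1u, 4u));
        }
        return 0;
    }

Keeping a flat index and recomputing the axis index from it is what lets the patch route
operator++, operator+= and operator-= all through operator[].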
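With that bookkeeping in place, QSymm8PerAxisDecoder::DecodeTensor above reduces to a single
linear pass that dequantizes each element with the scale of its current axis index. Below is a
hedged sketch of the same idea under the patch's assumptions (quantization along the innermost
[1,H,W,Cout] dimension, zero offset, one scale per output channel); the names are illustrative
and not armnn API.

    #include <cstdint>
    #include <cstddef>
    #include <vector>

    // Per-channel dequantization sketch for a [1,H,W,Cout] weight tensor quantized
    // along dimension 3; 'scales' holds one scale per output channel.
    static std::vector<float> DequantizePerChannel(const std::vector<int8_t>& quantized,
                                                   const std::vector<float>& scales)
    {
        std::vector<float> decoded;
        decoded.reserve(quantized.size());
        for (std::size_t i = 0; i < quantized.size(); ++i)
        {
            // The quantization axis is innermost (axisFactor == 1), so the scale
            // index is simply i % Cout.
            decoded.push_back(static_cast<float>(quantized[i]) * scales[i % scales.size()]);
        }
        return decoded;
    }

    int main()
    {
        // Two output channels with scales 0.5 and 2.0; values interleave per channel.
        const std::vector<int8_t> q = { 2, 3, 4, 5 };
        const std::vector<float>  s = { 0.5f, 2.0f };
        const std::vector<float>  d = DequantizePerChannel(q, s); // {1, 6, 2, 10}
        return d.size() == 4 ? 0 : 1;
    }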
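Finally, the ConvImpl.cpp hunk indexes the [1,H,W,O] depthwise filter so that the per-channel
values for one kernel position sit next to each other in memory. A minimal sketch of the
equivalent flat-index computation (hypothetical helper; armnn's Convolve() computes this
inline):

    #include <cassert>

    // Flat index of filter element (yFilter, xFilter, cOutput) in a [1,H,W,O] layout.
    static unsigned int DepthwiseFilterIndex(unsigned int yFilter,
                                             unsigned int xFilter,
                                             unsigned int cOutput,
                                             unsigned int filterWidth,
                                             unsigned int outputChannels)
    {
        // Output channels are innermost, so consecutive channels are contiguous.
        return (yFilter * filterWidth + xFilter) * outputChannels + cOutput;
    }

    int main()
    {
        // 3x3 kernel, 8 output channels: element (y=1, x=2, c=5) lives at (1*3 + 2)*8 + 5 = 45.
        assert(DepthwiseFilterIndex(1, 2, 5, 3, 8) == 45u);
        return 0;
    }

This is the same value as the patch's xFilter * outputChannels + cOutput +
yFilter * filterWidth * outputChannels, just factored for readability.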