From 4b19d2249e3b8f9216ec5b410fad20c41b4c6053 Mon Sep 17 00:00:00 2001 From: Cathal Corbett Date: Wed, 11 May 2022 20:12:17 +0100 Subject: IVGCVSW-6940 ConstTensorsAsInput: DepthwiseConvolution2d - Complete ACL * Added backend specific optimization & test for CpuAcc and GpuAcc: PermuteDepthwiseConv2dWeights Signed-off-by: Cathal Corbett Change-Id: I600476b2e9c557a39818a574c1091c9d650b21b1 --- src/backends/backendsCommon/WorkloadData.cpp | 41 +++++---- src/backends/backendsCommon/WorkloadUtils.cpp | 4 +- .../test/layerTests/Conv2dTestImpl.cpp | 97 ++++++++++++++++++++-- 3 files changed, 114 insertions(+), 28 deletions(-) (limited to 'src/backends/backendsCommon') diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp index 7a46741964..289f780fba 100644 --- a/src/backends/backendsCommon/WorkloadData.cpp +++ b/src/backends/backendsCommon/WorkloadData.cpp @@ -1416,24 +1416,6 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa descriptorName, m_Parameters.m_StrideX, m_Parameters.m_StrideY)); } - const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3; - - // Expected weight shape: [ 1, H, W, I*M ] - This shape does NOT depend on the data layout - // inputChannels * channelMultiplier should be equal to outputChannels. - const unsigned int numWeightOutputChannels = weightTensorInfo.GetShape()[3]; // I*M=Cout - const unsigned int numOutputChannels = outputTensorInfo.GetShape()[channelIndex]; - if (numWeightOutputChannels != numOutputChannels) - { - throw InvalidArgumentException(fmt::format( - "{0}: The weight format in armnn is expected to be [1, H, W, Cout]." - "But 4th dimension is not equal to Cout. Cout = {1} Provided weight shape: [{2}, {3}, {4}, {5}]", - descriptorName, - numOutputChannels, - weightTensorInfo.GetShape()[0], - weightTensorInfo.GetShape()[1], - weightTensorInfo.GetShape()[2], - weightTensorInfo.GetShape()[3])); - } if (weightTensorInfo.GetShape()[0] != 1) { throw InvalidArgumentException(fmt::format( @@ -1446,6 +1428,29 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa weightTensorInfo.GetShape()[3])); } + const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3; + const unsigned int numWeightOutputChannelsRefFormat = weightTensorInfo.GetShape()[3]; + const unsigned int numWeightOutputChannelsAclFormat = weightTensorInfo.GetShape()[1]; + const unsigned int numOutputChannels = outputTensorInfo.GetShape()[channelIndex]; + + // Weights format has two valid options: [1, H, W, Cout] (CpuRef) or [1, Cout, H, W] (CpuAcc/GpuAcc). + bool validRefFormat = (numWeightOutputChannelsRefFormat == numOutputChannels); + bool validAclFormat = (numWeightOutputChannelsAclFormat == numOutputChannels); + + if (!(validRefFormat || validAclFormat)) + { + throw InvalidArgumentException(fmt::format( + "{0}: The weight format in armnn is expected to be [1, H, W, Cout] (CpuRef) or [1, Cout, H, W] " + "(CpuAcc/GpuAcc). But neither the 4th (CpuRef) or 2nd (CpuAcc/GpuAcc) dimension is equal to Cout." + "Cout = {1} Provided weight shape: [{2}, {3}, {4}, {5}]", + descriptorName, + numOutputChannels, + weightTensorInfo.GetShape()[0], + weightTensorInfo.GetShape()[1], + weightTensorInfo.GetShape()[2], + weightTensorInfo.GetShape()[3])); + } + ValidateWeightDataType(inputTensorInfo, weightTensorInfo, descriptorName); Optional optionalBiasTensorInfo; diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp index d2ae16af0c..b045530abc 100644 --- a/src/backends/backendsCommon/WorkloadUtils.cpp +++ b/src/backends/backendsCommon/WorkloadUtils.cpp @@ -175,12 +175,14 @@ std::tuple Convert1HWOTensorInfoToAcl(const TensorInfo TensorInfo weightsPermuted; if (dataLayout == armnn::DataLayout::NHWC) { - // No permutation required. Data layouts are the same. + // No permutation required. Input and weights data layouts are the same. aclDepthMultiplier = weightInfo.GetShape()[3] / inputInfo.GetShape()[3]; weightsPermuted = weightInfo; } + else if (dataLayout == armnn::DataLayout::NCHW) { + // Weights permutation required. Weights [N,H,W,C] and input [N,C,H,W] data layouts are different. // [ 1, H, W, I*M] --> [ 1, I * M, H, W ] aclDepthMultiplier = weightInfo.GetShape()[3] / inputInfo.GetShape()[1]; PermutationVector permutationVector{ 0, 2, 3, 1 }; diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp index 4203fed23a..74c65e271c 100644 --- a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp +++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp @@ -1713,6 +1713,20 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( inputData = tmp; } + std::vector kernelData; + kernelData.assign(kernel.data(), kernel.data() + kernelHeight * kernelWidth * outputChannels); + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernelData.size()); + kernelDesc.SetShape(armnnUtils::Permuted(kernelDesc.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelDesc.GetShape(), {0, 2, 3, 1}, kernelData.data(), tmp.data(), sizeof(T)); + kernelData = tmp; + } + } + // Construct the output data, with bias applied, as appropriate. std::vector outputData; outputData.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth); @@ -1751,8 +1765,8 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl( // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method. // Cannot PolymorphicDowncast from ScopedTensorHandle->RefTensorHandle. // Need to PolymorphicDowncast from ITensorHandle->RefTensorHandle. - AllocateAndCopyDataToITensorHandle(&weightsTensor, kernel.data()); - AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernel.data()); // required for ConstantTensor + AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data()); + AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernelData.data()); // required for ConstantTensor AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); AddInputToWorkload(data, info, kernelDesc, weightsHandle.get()); @@ -1881,6 +1895,18 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl( kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset())); + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernelData.size()); + kernelDesc.SetShape(armnnUtils::Permuted(kernelDesc.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelDesc.GetShape(), {0, 2, 3, 1}, kernelData.data(), tmp.data(), sizeof(T)); + kernelData = tmp; + } + } + // Manually calculated. std::vector outputImage( QuantizedVector({ 0.f, 0.f }, @@ -2077,6 +2103,18 @@ LayerTestResult DepthwiseConvolution2dTestImpl( kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset())); + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernelData.size()); + kernelDesc.SetShape(armnnUtils::Permuted(kernelDesc.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelDesc.GetShape(), {0, 2, 3, 1}, kernelData.data(), tmp.data(), sizeof(T)); + kernelData = tmp; + } + } + // Manually calculated. std::vector originalOutputImage = std::vector( QuantizedVector({ @@ -2251,6 +2289,20 @@ LayerTestResult DepthwiseConvolution2dTestImpl( biasDesc.SetQuantizationOffset(0); } + std::vector kernelData; + kernelData.assign(originalKernel.data(), originalKernel.data() + kernelHeight*kernelWidth*outputChannels); + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernelData.size()); + kernelDesc.SetShape(armnnUtils::Permuted(kernelDesc.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelDesc.GetShape(), {0, 2, 3, 1}, kernelData.data(), tmp.data(), sizeof(T)); + kernelData = tmp; + } + } + // Construct input data std::vector input; input.assign(originalInput.data(), originalInput.data() + 1*inputChannels*inputHeight*inputWidth); @@ -2309,8 +2361,8 @@ LayerTestResult DepthwiseConvolution2dTestImpl( // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons. // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight). // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method. - AllocateAndCopyDataToITensorHandle(&weightsTensor, originalKernel.data()); // required for QueueDescriptor - AllocateAndCopyDataToITensorHandle(weightsHandle.get(), originalKernel.data()); // required for ConstantTensor + AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data()); // required for QueueDescriptor + AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernelData.data()); // required for ConstantTensor AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); AddInputToWorkload(data, info, kernelDesc, weightsHandle.get()); @@ -3029,22 +3081,37 @@ LayerTestResult CompareDepthwiseConvolution2dTestImpl( auto kernel = MakeRandomTensor(kernelDesc, 891234, 0.0f, 255.0f); auto bias = MakeRandomTensor::Type>(biasDesc, 1028, 0.0f, 255.0f); + armnn::TensorInfo aclKernelDescriptor = kernelDesc; + std::vector aclKernelData; + aclKernelData.assign(kernel.data(), kernel.data() + kernelHeight * kernelWidth * outputChannels); + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernel.size()); + aclKernelDescriptor.SetShape(armnnUtils::Permuted(kernelDesc.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelDesc.GetShape(), {0, 2, 3, 1}, kernel.data(), tmp.data(), sizeof(T)); + aclKernelData = tmp; + } + } + std::vector actualOutput(outputTensorInfo.GetNumElements()); std::vector expectedOutput(outputTensorInfo.GetNumElements()); std::unique_ptr inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo); - std::unique_ptr weightsHandle = tensorHandleFactory.CreateTensorHandle(kernelDesc); + std::unique_ptr weightsHandle = tensorHandleFactory.CreateTensorHandle(aclKernelDescriptor); std::unique_ptr biasHandle = tensorHandleFactory.CreateTensorHandle(biasDesc); std::unique_ptr outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo); armnn::DepthwiseConvolution2dQueueDescriptor data; armnn::WorkloadInfo info; - armnn::ScopedTensorHandle weightsTensor(kernelDesc); + armnn::ScopedTensorHandle weightsTensor(aclKernelDescriptor); armnn::ScopedTensorHandle biasTensor(biasDesc); AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); - AddInputToWorkload(data, info, kernelDesc, weightsHandle.get()); + AddInputToWorkload(data, info, aclKernelDescriptor, weightsHandle.get()); AddInputToWorkload(data, info, biasDesc, biasHandle.get()); AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); @@ -3052,8 +3119,8 @@ LayerTestResult CompareDepthwiseConvolution2dTestImpl( // See comment in DepthwiseConvolution2dAsymmetricTestImpl() for reasons. // 1) ScopedTensorHandle (weightsTensor) required for QueueDescriptor (data.m_Weight). // 2) ITensorHandle (converts to Backend TensorHandle) required in RefWorkload for GetTensorInfo() method. - AllocateAndCopyDataToITensorHandle(weightsHandle.get(), kernel.data()); - AllocateAndCopyDataToITensorHandle(&weightsTensor, kernel.data()); + AllocateAndCopyDataToITensorHandle(weightsHandle.get(), aclKernelData.data()); + AllocateAndCopyDataToITensorHandle(&weightsTensor, aclKernelData.data()); AllocateAndCopyDataToITensorHandle(biasHandle.get(), bias.data()); AllocateAndCopyDataToITensorHandle(&biasTensor, bias.data()); @@ -3788,6 +3855,18 @@ LayerTestResult DepthwiseConvolution2dPerAxisQuantTest( 1, 1, 1, 1 }; + if (workloadFactory.GetBackendId() == armnn::BackendId("GpuAcc") || + workloadFactory.GetBackendId() == armnn::BackendId("CpuAcc")) + { + if (layout == armnn::DataLayout::NCHW) + { + std::vector tmp(kernelData.size()); + kernelInfo.SetShape(armnnUtils::Permuted(kernelInfo.GetShape(), {0, 2, 3, 1})); + armnnUtils::Permute(kernelInfo.GetShape(), {0, 2, 3, 1}, kernelData.data(), tmp.data(), sizeof(int8_t)); + kernelData = tmp; + } + } + std::vector biasData = { 4, 4, 4, 4 -- cgit v1.2.1