From fbfa49eeb14c6cb94d47e3c770b0c168e818cf79 Mon Sep 17 00:00:00 2001 From: Tianle Cheng Date: Tue, 23 Jan 2024 11:21:48 +0000 Subject: IVGCVSW-7571 GpuFsa Op: Add Depthwise Conv2d * Added DepthwiseConv2d support for GpuFsa backend. * Updated DepthwiseConv2d End-to-End test Signed-off-by: Tianle Cheng Change-Id: I646839980d138ae235a00990c97c6e66a4418a5e --- .../test/DepthwiseConvolution2dEndToEndTests.hpp | 88 +++++---- src/backends/gpuFsa/GpuFsaBackend.cpp | 26 +++ src/backends/gpuFsa/GpuFsaLayerSupport.cpp | 29 +++ src/backends/gpuFsa/layers/CMakeLists.txt | 2 + src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp | 1 - .../gpuFsa/layers/GpuFsaDepthwiseConvolution2d.cpp | 210 +++++++++++++++++++++ .../gpuFsa/layers/GpuFsaDepthwiseConvolution2d.hpp | 30 +++ src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp | 8 + src/backends/reference/test/RefEndToEndTests.cpp | 7 + 9 files changed, 362 insertions(+), 39 deletions(-) create mode 100644 src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.cpp create mode 100644 src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.hpp diff --git a/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp b/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp index 1f9b60a4f2..a2c369b692 100644 --- a/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp +++ b/src/backends/backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022, 2024 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once @@ -67,34 +67,34 @@ void DepthwiseConvolution2dEndToEnd(const std::vector& backend unsigned int kernelWidth = 3; unsigned int outputHeight = inputHeight - kernelHeight + 1 + 2; - unsigned int outputWidth = (inputWidth - kernelWidth + 1)/2; + unsigned int outputWidth = inputWidth - kernelWidth + 1; unsigned int outputChannels = inputChannels * depthMultiplier; unsigned int outputBatchSize = inputBatchSize; - TensorInfo inputInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, ArmnnType, qScale, qOffset, true); - TensorInfo outputInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, ArmnnType, qScale, qOffset); - TensorInfo weightsInfo({1, kernelHeight, kernelWidth, outputChannels}, ArmnnType, qScale, qOffset, true); - TensorInfo biasesInfo({outputChannels}, ArmnnBType, qScale * qScale, 0, true); + TensorInfo inputInfo({ inputBatchSize, inputHeight, inputWidth, inputChannels }, ArmnnType, qScale, qOffset, true); + TensorInfo outputInfo({ outputBatchSize, outputHeight, outputWidth, outputChannels }, ArmnnType, qScale, qOffset); + TensorInfo weightsInfo({ 1, kernelHeight, kernelWidth, outputChannels }, ArmnnType, qScale, qOffset, true); + TensorInfo biasesInfo({ outputChannels }, ArmnnBType, qScale * qScale, 0, true); std::vector inputData = { - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 
0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f - }; + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 
0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 1.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, + 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f, 0.5f, 0.0f + }; std::vector weightsData = { @@ -127,31 +127,43 @@ void DepthwiseConvolution2dEndToEnd(const std::vector& backend std::vector expectedOutputData = { - 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, - 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, - 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.5f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, - 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 2.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, - 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 4.5f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, - 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, - 1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 2.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f - }; + 3.0f, 4.5f, 2.0f, 1.0f, 3.0f, 4.5f, 3.0f, 1.0f, 3.0f, 4.5f, 4.0f, 3.0f, 3.0f, 4.5f, + 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, + 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, + 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, 3.0f, 4.5f, 1.0f, -1.0f, + 3.0f, 5.5f, 3.0f, 2.0f, 3.0f, 5.5f, 4.0f, 2.0f, 3.0f, 5.5f, 5.0f, 
4.0f, 3.0f, 5.5f, + 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, + 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, + 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, 3.0f, 5.5f, 1.0f, -1.0f, + 5.0f, 6.5f, 3.0f, 2.0f, 5.0f, 6.5f, 4.0f, 2.0f, 5.0f, 6.5f, 5.0f, 4.0f, 5.0f, 6.5f, + 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, + 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, + 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, 5.0f, 6.5f, 1.0f, -1.0f, + 5.5f, 8.0f, 3.0f, 2.0f, 5.5f, 8.0f, 4.0f, 2.0f, 5.5f, 8.0f, 5.0f, 4.0f, 5.5f, 8.0f, + 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, + 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, + 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, + 5.5f, 8.0f, 3.0f, 2.0f, 5.5f, 8.0f, 4.0f, 2.0f, 5.5f, 8.0f, 5.0f, 4.0f, 5.5f, 8.0f, + 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, + 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, + 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, 5.5f, 8.0f, 1.0f, -1.0f, + 5.0f, 8.0f, 3.0f, 2.0f, 5.0f, 8.0f, 4.0f, 2.0f, 5.0f, 8.0f, 5.0f, 4.0f, 5.0f, 8.0f, + 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, + 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, + 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f, 5.0f, 8.0f, 1.0f, -1.0f + }; DepthwiseConvolution2dDescriptor descriptor; descriptor.m_PadLeft = 0; descriptor.m_PadRight = 0; descriptor.m_PadTop = 1; - descriptor.m_PadBottom = 0; - descriptor.m_StrideX = 2; + descriptor.m_PadBottom = 1; + descriptor.m_StrideX = 1; descriptor.m_StrideY = 1; 
descriptor.m_BiasEnabled = true; descriptor.m_DataLayout = dataLayout; - // Permute input and output if NCDHW. + // Permute input if NCHW, the original input and output are in NHWC format. if (dataLayout == DataLayout::NCHW) { PermuteTensorNhwcToNchw(inputInfo, inputData); diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp index 9886a6e187..e80369965b 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.cpp +++ b/src/backends/gpuFsa/GpuFsaBackend.cpp @@ -21,6 +21,7 @@ #include #include "layers/GpuFsaConvolution2d.hpp" +#include "layers/GpuFsaDepthwiseConvolution2d.hpp" namespace armnn { @@ -268,6 +269,31 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra } break; } + case (LayerType::DepthwiseConvolution2d): + { + auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(); + + auto desc = PolymorphicDowncast(&base.GetParameters()); + if (desc->m_BiasEnabled) + { + auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo(); + GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr, + input, + *desc, + weights, + bias); + } + else + { + GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr, + input, + *desc, + weights, + EmptyOptional()); + } + break; + } default: // unsupported layer for GpuFsa backend continue; diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp index 96c986ab33..18c9ac8f5b 100644 --- a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp +++ b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp @@ -11,6 +11,7 @@ #if defined(ARMCOMPUTEGPUFSA_ENABLED) #include "layers/GpuFsaConvolution2d.hpp" +#include "layers/GpuFsaDepthwiseConvolution2d.hpp" #endif #include @@ -98,6 +99,34 @@ bool GpuFsaLayerSupport::IsLayerSupported(const LayerType& type, infos[3]); } } + case LayerType::DepthwiseConvolution2d: + { + if (infos.size() != 4) + { + throw 
InvalidArgumentException("Invalid number of DepthwiseConvolution2dDescriptor TensorInfos. " + "TensorInfos should be of format: {input, output, weights, biases}."); + } + + auto desc = *(PolymorphicDowncast(&descriptor)); + if (infos[3] == TensorInfo()) + { + FORWARD_LAYER_VALIDATE_FUNC(GpuFsaDepthwiseConvolution2dValidate, + reasonIfUnsupported, + infos[0], + desc, + infos[2], + EmptyOptional()); + } + else + { + FORWARD_LAYER_VALIDATE_FUNC(GpuFsaDepthwiseConvolution2dValidate, + reasonIfUnsupported, + infos[0], + desc, + infos[2], + infos[3]); + } + } case LayerType::Constant: case LayerType::Input: case LayerType::Output: diff --git a/src/backends/gpuFsa/layers/CMakeLists.txt b/src/backends/gpuFsa/layers/CMakeLists.txt index 3a02ce1a77..c174c51640 100644 --- a/src/backends/gpuFsa/layers/CMakeLists.txt +++ b/src/backends/gpuFsa/layers/CMakeLists.txt @@ -6,6 +6,8 @@ list(APPEND armnnGpuFsaBackendLayerValidators_sources GpuFsaConvolution2d.cpp GpuFsaConvolution2d.hpp + GpuFsaDepthwiseConvolution2d.cpp + GpuFsaDepthwiseConvolution2d.hpp ) add_library(armnnGpuFsaBackendLayerValidators OBJECT ${armnnGpuFsaBackendLayerValidators_sources}) diff --git a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp index 7aa643dcb1..239317453e 100644 --- a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp +++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp @@ -21,7 +21,6 @@ #include #include -#include namespace armnn { diff --git a/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.cpp b/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.cpp new file mode 100644 index 0000000000..01a36f2a8b --- /dev/null +++ b/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.cpp @@ -0,0 +1,210 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaDepthwiseConvolution2d.hpp"
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status GpuFsaDepthwiseConvolution2dValidate(const TensorInfo& input,
+                                                         const DepthwiseConvolution2dDescriptor& descriptor,
+                                                         const TensorInfo& weights,
+                                                         const Optional<TensorInfo>& biases)
+{
+    // Create a throw-away workload context and sketch, used for validation purposes only
+    auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
+    auto workloadContext = GpuWorkloadContext(&compileCtx);
+    GpuWorkloadSketch sketch{ &workloadContext };
+
+    // Build the ACL tensor infos and register them with the workload context
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+
+    // ArmNN format for weights for depthwise is [1, H, W, C] independently of the input/output layout
+    //
+    // ACL format for weights for depthwise is:
+    // - [1, H, W, C] for [N, H, W, C] input/output layout (matches with ArmNN)
+    // - [1, C, H, W] for [N, C, H, W] input/output layout
+    //
+    // Therefore ArmNN weights have to be permuted when input/output layout is [N, C, H, W] to pass them to ACL.
+    // The PermuteDepthwiseConv2dWeights backend optimization takes care of this, but it has not been performed yet,
+    // so we do the permute here for the TensorInfo weights (the leading dimension is then dropped below).
+    unsigned int aclDepthMultiplier;
+    TensorInfo weightsPermuted;
+    std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input,descriptor.m_DataLayout);
+    auto weightsShape = weightsPermuted.GetShape();
+    weightsPermuted.SetShape({weightsShape[1], weightsShape[2], weightsShape[3]});
+
+    arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
+    aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+
+    auto inputInfo = workloadContext.create_tensor_info(aclInputInfo);
+    auto weightInfo = workloadContext.create_tensor_info(aclWeightsInfo);
+
+    // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op
+    arm_compute::TensorInfo aclBiasInfo;
+    arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr;
+
+    if (descriptor.m_BiasEnabled)
+    {
+        if(!biases.has_value())
+        {
+            throw InvalidArgumentException(
+                "GpuFsaDepthwiseConvolution2dValidate: No biases set when biases are enabled");
+        }
+        aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
+        aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
+
+        biasSketchInfoPtr = workloadContext.create_tensor_info(aclBiasInfo);
+    }
+
+    // Set DepthwiseConv2d attributes using descriptor
+    const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX,
+                                                                      descriptor.m_DilationY);
+    const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
+    const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
+
+    DepthwiseConv2dAttributes depthwiseConv2dAttributes{};
+    depthwiseConv2dAttributes.pad(aclPadInfo);
+    depthwiseConv2dAttributes.stride(aclStrideInfo);
+    depthwiseConv2dAttributes.dilation(aclDilationInfo);
+    depthwiseConv2dAttributes.depth_multiplier(aclDepthMultiplier);
+
+    // Validate the operator and hand the resulting status back to the caller
+    arm_compute::Status aclStatus = GpuDepthwiseConv2d::validate_op(sketch,
+                                                                    inputInfo,
+                                                                    weightInfo,
+                                                                    biasSketchInfoPtr,
+                                                                    depthwiseConv2dAttributes);
+
+    return aclStatus;
+}
+
+void GpuFsaDepthwiseConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob,
+                                          const TensorInfo& input,
+                                          const DepthwiseConvolution2dDescriptor& descriptor,
+                                          const TensorInfo& weights,
+                                          const Optional<TensorInfo>& biases)
+{
+/*
+* Creating an Op for the GpuFsa backend requires us to create and maintain quite a bit of data, which is then stored
+* in a GpuFsaPreCompiledBlob for execution later. Specifically we need:
+* GpuWorkloadContext, this contains the TensorInfos and is unique to the Graph being executed
+* Sketch, this is similar to a subgraph and can contain one or more operations. Multiple ops can be "fused" together
+* using a single sketch.
+* The inputTensorInfos / outputTensorInfos, these are pointers to the TensorInfos used when creating the sketch.
+* They refer to the TensorInfos stored within the GpuWorkloadContext and are needed when executing the sketch
+* as the TensorInfos used when creating the Tensors must match those used to create the Sketch. Otherwise the runtime
+* doesn't know which Tensors to use.
+*/
+    using namespace arm_compute::experimental::dynamic_fusion;
+    GpuWorkloadSketch* sketch = blob->sketch.get();
+    GpuWorkloadContext* workloadContext = blob->workloadContext.get();
+    std::vector<arm_compute::ITensorInfo*> inputTensorInfos = {};
+    std::vector<arm_compute::ITensorInfo*> outputTensorInfos = {};
+
+    // Build the ACL tensor infos and register them with the blob's workload context
+    const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+
+    // ArmNN format for weights for depthwise is [1, H, W, C] independently of the input/output layout
+    //
+    // ACL format for weights for depthwise is:
+    // - [1, H, W, C] for [N, H, W, C] input/output layout (matches with ArmNN)
+    // - [1, C, H, W] for [N, C, H, W] input/output layout
+    //
+    // Therefore ArmNN weights have to be permuted when input/output layout is [N, C, H, W] to pass them to ACL.
+    // The PermuteDepthwiseConv2dWeights backend optimization takes care of this, but it has not been performed yet,
+    // so we do the permute here for the TensorInfo weights (the leading dimension is then dropped below).
+    unsigned int aclDepthMultiplier;
+    TensorInfo weightsPermuted;
+    std::tie(weightsPermuted, aclDepthMultiplier) = Convert1HWOTensorInfoToAcl(weights, input,descriptor.m_DataLayout);
+    auto weightsShape = weightsPermuted.GetShape();
+    weightsPermuted.SetShape({weightsShape[1], weightsShape[2], weightsShape[3]});
+
+    arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weightsPermuted, descriptor.m_DataLayout);
+    aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+
+    inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclInputInfo));
+    inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclWeightsInfo));
+
+    // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op and create_op
+    arm_compute::TensorInfo aclBiasInfo;
+    arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr;
+
+    if (descriptor.m_BiasEnabled)
+    {
+        if(!biases.has_value())
+        {
+            throw InvalidArgumentException(
+                "GpuFsaDepthwiseConvolution2dCreateOp: No biases set when biases are enabled");
+        }
+        aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
+        aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
+
+        inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclBiasInfo));
+        biasSketchInfoPtr = inputTensorInfos[2];
+    }
+
+    // Set DepthwiseConv2d attributes using descriptor
+    const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX, descriptor.m_DilationY);
+    const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
+    const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
+
+    DepthwiseConv2dAttributes depthwiseConv2dAttributes{};
+    depthwiseConv2dAttributes.pad(aclPadInfo);
+    depthwiseConv2dAttributes.stride(aclStrideInfo);
+    depthwiseConv2dAttributes.dilation(aclDilationInfo);
+    depthwiseConv2dAttributes.depth_multiplier(aclDepthMultiplier);
+
+    // Validate the operator against the blob's sketch and throw if ACL rejects this configuration
+    arm_compute::Status aclStatus = GpuDepthwiseConv2d::validate_op(*sketch,
+                                                                    inputTensorInfos[0],
+                                                                    inputTensorInfos[1],
+                                                                    biasSketchInfoPtr,
+                                                                    depthwiseConv2dAttributes);
+
+    const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+    if (!supported)
+    {
+        throw BackendCapabilityException(
+            "\"GpuFsa\" backend failed during DepthwiseConvolution2D operation validation");
+    }
+
+    // Create the Op within the Sketch using the TensorInfos we have stored
+    arm_compute::ITensorInfo* convOutInfo = GpuDepthwiseConv2d::create_op(*sketch,
+                                                                          inputTensorInfos[0],
+                                                                          inputTensorInfos[1],
+                                                                          biasSketchInfoPtr,
+                                                                          depthwiseConv2dAttributes);
+
+    outputTensorInfos.emplace_back(workloadContext->create_tensor_info());
+    GpuOutput::create_op(*sketch, convOutInfo, outputTensorInfos[0]);
+
+    // Store copies of the TensorInfo pointer lists within the blob as unique_ptrs to be used later
+    blob->inputTensorInfos = std::make_unique<std::vector<arm_compute::ITensorInfo*>>(inputTensorInfos);
+    blob->outputTensorInfos = std::make_unique<std::vector<arm_compute::ITensorInfo*>>(outputTensorInfos);
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.hpp b/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.hpp
new file mode 100644
index 0000000000..b705096ef3
--- /dev/null
+++ b/src/backends/gpuFsa/layers/GpuFsaDepthwiseConvolution2d.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +#include +#include +#include + +namespace armnn +{ + +using namespace arm_compute::experimental::dynamic_fusion; + +arm_compute::Status GpuFsaDepthwiseConvolution2dValidate(const TensorInfo& input, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases); + +void GpuFsaDepthwiseConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, + const TensorInfo& input, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases); + +} // namespace armnn diff --git a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp index c2cdd57574..79dd9d357d 100644 --- a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp +++ b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp @@ -6,6 +6,8 @@ #include "backendsCommon/test/EndToEndTestImpl.hpp" #include "backendsCommon/test/Convolution2dEndToEndTestImpl.hpp" +#include "backendsCommon/test/DepthwiseConvolution2dEndToEndTests.hpp" + #include TEST_SUITE("GpuFsaEndToEnd") @@ -24,4 +26,10 @@ TEST_CASE("GpuFsaConv2dWithoutBiasEndtoEndTestFloat32") Convolution2dEndToEnd(gpuFsaDefaultBackends, armnn::DataLayout::NHWC, false); } +TEST_CASE("GpuFsaDepthwiseConvolution2dEndtoEndTestFloat32") +{ + DepthwiseConvolution2dEndToEnd(gpuFsaDefaultBackends, + armnn::DataLayout::NHWC); +} + } diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp index 63f76b17b1..199fbdfafd 100644 --- a/src/backends/reference/test/RefEndToEndTests.cpp +++ b/src/backends/reference/test/RefEndToEndTests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -652,6 +653,12 @@ TEST_CASE("RefConvolution3dInt8Test") armnn::DataLayout::NDHWC); } +TEST_CASE("RefDepthwiseConvolution2dEndtoEndFloat32Test") +{ + DepthwiseConvolution2dEndToEnd(defaultBackends, + 
armnn::DataLayout::NHWC); +} + TEST_CASE("RefEluEndToEndTestFloat32") { EluEndToEndTest(defaultBackends); -- cgit v1.2.1