author | David Monahan <david.monahan@arm.com> | 2023-12-08 12:50:02 +0000
---|---|---
committer | David Monahan <david.monahan@arm.com> | 2024-01-22 16:23:48 +0000
commit | bd738081b8fcea4599a06f01d5c07979f3b0fcb3 (patch) |
tree | fa0c5ef9765e5f4de8551114752473c0a71747cd |
parent | 5bc14146eb310465f3d7c59ac294aa32bc69984a (diff) |
download | armnn-bd738081b8fcea4599a06f01d5c07979f3b0fcb3.tar.gz |
IVGCVSW-7165 - PreCompiledWorkload and Conv2d Integration work for GpuFsa
* Add PreCompiledWorkload implementation for GpuFsa
* Add ConstantWorkload implementation for GpuFsa
* Add Input/Output workloads for GpuFsa
* Add CopyMemGeneric workload for GpuFsa
* Separate creation and validation of sketch tensors into separate functions
Signed-off-by: Kevin May <kevin.may@arm.com>
Signed-off-by: David Monahan <david.monahan@arm.com>
Change-Id: Ie7299a4c61073b5ca03d9f8681458869ef7ce743
18 files changed, 794 insertions, 175 deletions
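Taken together, the patch moves GpuFsa from validate-only scaffolding to an executable path: operators are fused into a sketch at optimization time, packaged into a GpuFsaPreCompiledBlob, and replayed by a PreCompiled workload at execution time. A minimal sketch of that flow, assuming the types introduced in this patch (the driver scaffolding and tensor values are illustrative only, not part of the patch):

    using namespace arm_compute::experimental::dynamic_fusion;

    // One GpuWorkloadContext is created per graph in OptimizeSubgraphView and
    // shared by every fused operator; it owns the TensorInfos.
    arm_compute::CLCompileContext* compileCtx =
        &(arm_compute::CLKernelLibrary::get().get_compile_context());
    auto workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);

    // Each supported layer gets a blob holding its own sketch plus the shared context.
    auto* blob = new armnn::GpuFsaPreCompiledBlob();
    blob->workloadContext = workloadContext;
    blob->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

    // The layer helper fuses the op into the sketch and records the tensor ids
    // (illustrative TensorInfos; real ones come from the subgraph's output slots).
    armnn::TensorInfo input({1, 8, 8, 16}, armnn::DataType::Float32);
    armnn::TensorInfo weights({4, 3, 3, 16}, armnn::DataType::Float32);
    weights.SetConstant();
    armnn::Convolution2dDescriptor desc;
    desc.m_StrideX = desc.m_StrideY = 1;
    desc.m_DataLayout = armnn::DataLayout::NHWC;
    armnn::GpuFsaConvolution2dCreateOp(blob, input, desc, weights, armnn::EmptyOptional());

    // The blob then travels inside a PreCompiled layer; GpuFsaPreCompiledWorkload::Execute
    // later rebuilds a ClWorkloadRuntime from blob->sketch and blob->inputIds/outputIds.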
diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt index 8d1a58ee27..5181f2288e 100644 --- a/src/backends/gpuFsa/CMakeLists.txt +++ b/src/backends/gpuFsa/CMakeLists.txt @@ -23,7 +23,7 @@ if(ARMCOMPUTEGPUFSA) GpuFsaWorkloadFactory.hpp ) - add_subdirectory(layerValidators) + add_subdirectory(layers) add_subdirectory(workloads) if(BUILD_UNIT_TESTS) diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp index 8ea9e8e7d3..9886a6e187 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.cpp +++ b/src/backends/gpuFsa/GpuFsaBackend.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -20,10 +20,7 @@ #include <arm_compute/core/CL/CLKernelLibrary.h> #include <arm_compute/runtime/CL/CLBufferAllocator.h> -#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> -#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h> - -#include "layerValidators/GpuFsaConvolution2dValidate.hpp" +#include "layers/GpuFsaConvolution2d.hpp" namespace armnn { @@ -218,9 +215,6 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra OptimizationViews optimizationViews(modelOptions); using namespace arm_compute::experimental::dynamic_fusion; - // Create a new workload sketch, for validation purposes - auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context(); - auto gpuCtx = GpuWorkloadContext(&compileCtx); auto it = subgraph.end(); std::map<LayerGuid, Layer*> untouched; @@ -233,32 +227,41 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra GpuFsaLayerSupport supportChecker; it = subgraph.end(); + arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context()); + + // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos + std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx); while (it != subgraph.begin()) { --it; Layer& base = *(PolymorphicDowncast<Layer*>(*it)); + // Create a GpuFsaPreCompiledBlob, this contains all of the information needed to execute an operator + GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob(); + preCompiledBlobPtr->workloadContext = workloadContext; + preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get()); - std::unique_ptr<GpuWorkloadSketch> sketch = std::make_unique<GpuWorkloadSketch>(&gpuCtx); + // Configure and set up the sketch for each supported op.
Their data will be wrapped into a PreCompiled layer switch (base.GetType()) { case (LayerType::Convolution2d): { auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(); - //std::vector<TensorInfo> infos = {input, weights}; auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters()); if (desc->m_BiasEnabled) { auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo(); - GpuFsaConvolution2dCreateOp(input, + GpuFsaConvolution2dCreateOp(preCompiledBlobPtr, + input, *desc, weights, bias); } else { - GpuFsaConvolution2dCreateOp(input, + GpuFsaConvolution2dCreateOp(preCompiledBlobPtr, + input, *desc, weights, EmptyOptional()); @@ -270,7 +273,8 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra continue; } - auto compiledBlob = std::make_unique<PreCompiledObjectPtr>(sketch.release(), DeleteAsType<GpuWorkloadSketch>); + auto compiledBlob = + std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>); IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer( PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()), @@ -289,7 +293,7 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra CreateOutputsFrom(&base), {&base}); - optimizationViews.AddSubstitution({ *substituteSubgraph, SubgraphView(preCompiledLayer) }); + optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) }); untouched.erase(base.GetGuid()); } diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp index 26960065c7..4c2a5f02e3 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.hpp +++ b/src/backends/gpuFsa/GpuFsaBackend.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -11,6 +11,8 @@ #include <arm_compute/runtime/CL/CLMemoryRegion.h> #include <arm_compute/core/CL/CLKernelLibrary.h> #include <CL/cl_ext.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h> // System includes for mapping and unmapping memory #include <sys/mman.h> @@ -18,13 +20,31 @@ namespace armnn { +/** + * A structure which contains all the elements needed to execute a fused workload in the GpuFsa Backend + * + * @param[in, out] sketch A unique pointer to the sketch containing the operators which have been fused. + * @param[in, out] workloadContext A shared pointer to a GpuWorkloadContext which contains the TensorInfos + * @param[in, out] inputIds A unique pointer to a vector of input Ids used to access workloadContext TensorInfos + * @param[in, out] outputIds A unique pointer to a vector of output Ids used to access workloadContext TensorInfos + * + */ +struct GpuFsaPreCompiledBlob +{ + std::unique_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch> sketch = nullptr; + std::shared_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadContext> workloadContext = nullptr; + + std::unique_ptr<std::vector<int32_t>> inputIds = nullptr; + std::unique_ptr<std::vector<int32_t>> outputIds = nullptr; +}; + // add new capabilities here..
const BackendCapabilities gpuFsaCapabilities("GpuFsa", { {"NonConstWeights", false}, {"AsyncExecution", false}, {"ProtectedContentAllocation", false}, - {"ConstantTensorsAsInputs", false}, + {"ConstantTensorsAsInputs", true}, {"PreImportIOTensors", false}, {"ExternallyManagedMemory", false}, {"MultiAxisPacking", false}, diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp index 063af2732e..96c986ab33 100644 --- a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp +++ b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -10,7 +10,7 @@ #include <armnn/utility/PolymorphicDowncast.hpp> #if defined(ARMCOMPUTEGPUFSA_ENABLED) -#include "layerValidators/GpuFsaConvolution2dValidate.hpp" +#include "layers/GpuFsaConvolution2d.hpp" #endif #include <vector> diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp index 6d13879f51..faa0d38386 100644 --- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp +++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -9,6 +9,11 @@ #include "GpuFsaBackendId.hpp" #include "GpuFsaTensorHandle.hpp" +#include "workloads/GpuFsaConstantWorkload.hpp" +#include "workloads/GpuFsaPreCompiledWorkload.hpp" + +#include <armnn/backends/MemCopyWorkload.hpp> + namespace armnn { @@ -43,11 +48,13 @@ bool IsDataType(const WorkloadInfo& info) GpuFsaWorkloadFactory::GpuFsaWorkloadFactory(const std::shared_ptr<GpuFsaMemoryManager>& memoryManager) : m_MemoryManager(memoryManager) { + InitializeCLCompileContext(); } GpuFsaWorkloadFactory::GpuFsaWorkloadFactory() : m_MemoryManager(new GpuFsaMemoryManager()) { + InitializeCLCompileContext(); } const BackendId& GpuFsaWorkloadFactory::GetBackendId() const @@ -81,11 +88,52 @@ std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const T return tensorHandle; } -std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/, - const QueueDescriptor& /*descriptor*/, - const WorkloadInfo& /*info*/) const + +void GpuFsaWorkloadFactory::InitializeCLCompileContext() { + // Initialize our m_CLCompileContext using default device and context + auto context = arm_compute::CLKernelLibrary::get().context(); + auto device = arm_compute::CLKernelLibrary::get().get_device(); + m_CLCompileContext = arm_compute::CLCompileContext(context, device); +} + +std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType type, + const QueueDescriptor& descriptor, + const WorkloadInfo& info) const { - return nullptr; + switch(type) + { + case LayerType::Constant : + { + auto constQueueDescriptor = PolymorphicDowncast<const ConstantQueueDescriptor*>(&descriptor); + return std::make_unique<GpuFsaConstantWorkload>(*constQueueDescriptor, info, m_CLCompileContext); + } + case LayerType::Input : + { + auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor); + return std::make_unique<CopyMemGenericWorkload>(*inputQueueDescriptor, info); + } + case LayerType::Output : + { + auto outputQueueDescriptor = PolymorphicDowncast<const OutputQueueDescriptor*>(&descriptor); + return 
std::make_unique<CopyMemGenericWorkload>(*outputQueueDescriptor, info); + } + case LayerType::MemCopy : + { + auto memCopyQueueDescriptor = PolymorphicDowncast<const MemCopyQueueDescriptor*>(&descriptor); + if (memCopyQueueDescriptor->m_Inputs.empty() || !memCopyQueueDescriptor->m_Inputs[0]) + { + throw InvalidArgumentException("GpuFsaWorkloadFactory: Invalid null input for MemCopy workload"); + } + return std::make_unique<CopyMemGenericWorkload>(*memCopyQueueDescriptor, info); + } + case LayerType::PreCompiled : + { + auto precompiledQueueDescriptor = PolymorphicDowncast<const PreCompiledQueueDescriptor*>(&descriptor); + return std::make_unique<GpuFsaPreCompiledWorkload>(*precompiledQueueDescriptor, info); + } + default : + return nullptr; + } } } // namespace armnn
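With the stubbed CreateWorkload replaced, the factory now dispatches five layer types: Constant, Input, Output, MemCopy, and PreCompiled, with Input/Output serviced by the generic copy workload. A hedged usage sketch (descriptors are normally populated by the LoadedNetwork rather than built by hand):

    armnn::GpuFsaWorkloadFactory factory;  // constructor also calls InitializeCLCompileContext()

    armnn::WorkloadInfo info;
    armnn::ActivationQueueDescriptor activationDesc;

    // Layer types outside the switch fall through to nullptr rather than throwing,
    // so callers must still null-check the returned workload.
    auto workload = factory.CreateWorkload(armnn::LayerType::Activation, activationDesc, info);
    // workload == nullptr here: Activation is not yet supported by GpuFsa.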
\ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp index 9b97070766..04074cf0ab 100644 --- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp +++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -8,6 +8,8 @@ #include <armnn/Optional.hpp> +#include <arm_compute/core/CL/CLCompileContext.h> + namespace armnn { @@ -44,6 +46,7 @@ public: std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo, DataLayout dataLayout, const bool IsMemoryManaged = true) const override; + void InitializeCLCompileContext(); std::unique_ptr<IWorkload> CreateWorkload(LayerType type, const QueueDescriptor& descriptor, @@ -54,6 +57,7 @@ private: std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const; mutable std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager; + arm_compute::CLCompileContext m_CLCompileContext; }; } // namespace armnn diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk index d8d254205b..a219ad4fec 100644 --- a/src/backends/gpuFsa/backend.mk +++ b/src/backends/gpuFsa/backend.mk @@ -1,5 +1,5 @@ # -# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +# Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT # @@ -22,35 +22,29 @@ BACKEND_SOURCES := \ GpuFsaRegistryInitializer.cpp \ GpuFsaTensorHandleFactory.cpp \ GpuFsaWorkloadFactory.cpp \ - layerValidators/GpuFsaConvolution2dValidate.cpp -else - -# ARMNN_COMPUTE_GPUFSA_ENABLED == 0 -# No source file will be compiled for the GPU Dynamic Fusion backend - -BACKEND_SOURCES := - -endif + layers/GpuFsaConvolution2d.cpp # BACKEND_TEST_SOURCES contains the list of files to be included # in the Android unit test build (armnn-tests) and it is picked # up by the Android.mk file in the root of ArmNN -# The variable to enable/disable the GPU Dynamic Fusion backend -# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk) -ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1) - # ARMNN_COMPUTE_GPUFSA_ENABLED == 1 # Include the source files for the GPU Dynamic Fusion backend tests BACKEND_TEST_SOURCES := \ - test/GpuFsaEndToEndTests.cpp \ + test/GpuFsaEndToEndTests.cpp \ test/GpuFsaLayerSupportTests.cpp \ test/GpuFsaLayerTests.cpp \ test/GpuFsaOptimizedNetworkTests.cpp else # ARMNN_COMPUTE_GPUFSA_ENABLED == 0 +# No source file will be compiled for the GPU Dynamic Fusion backend + +BACKEND_SOURCES := + + +# ARMNN_COMPUTE_GPUFSA_ENABLED == 0 # No source file will be compiled for the GPU Dynamic Fusion backend tests BACKEND_TEST_SOURCES := diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp deleted file mode 100644 index bed7b26f74..0000000000 --- a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// -// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "GpuFsaConvolution2dValidate.hpp" - -#include <armnn/Types.hpp> -#include <armnn/utility/IgnoreUnused.hpp> - -#include <aclCommon/ArmComputeTensorUtils.hpp> - -#include <arm_compute/core/ITensorInfo.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/CL/CLKernelLibrary.h> -#include <arm_compute/core/CL/CLCompileContext.h> - -#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h> -#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> -#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h> -#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h> - -#include <vector> -#include <iostream> - -namespace armnn -{ - -using namespace armcomputetensorutils; - -inline arm_compute::Status ValidateAndCreateOp(const TensorInfo& input, - const Convolution2dDescriptor& descriptor, - const TensorInfo& weights, - const Optional<TensorInfo>& biases, - const bool createOp = false) -{ - // Create a new workload sketch, for validation purposes - auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context(); - auto gpuCtx = GpuWorkloadContext(&compileCtx); - GpuWorkloadSketch sketch{ &gpuCtx }; - - // Build and create tensor infos using the sketch - const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); - arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); - aclWeightsInfo.set_are_values_constant(weights.IsConstant()); - - auto inputInfo = gpuCtx.create_tensor_info(aclInputInfo); - auto weightInfo = gpuCtx.create_tensor_info(aclWeightsInfo); - - // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op - arm_compute::TensorInfo aclBiasInfo; - arm_compute::TensorInfo biasSketchInfo; - arm_compute::TensorInfo* biasSketchInfoPtr = nullptr; - - if (descriptor.m_BiasEnabled) - { - if(!biases.has_value()) - { - throw InvalidArgumentException("GpuFsaConvolution2dValidate: No biases set when biases are enabled"); - } - aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); - aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); - - biasSketchInfo = gpuCtx.create_tensor_info(aclBiasInfo); - biasSketchInfoPtr = &biasSketchInfo; - } - - // Set Conv2d attributes using descriptor - const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX, - descriptor.m_DilationY); - const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor); - const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY); - - Conv2dAttributes conv2DAttributes{}; - conv2DAttributes.dilation(aclDilationInfo); - conv2DAttributes.pad(aclPadInfo); - conv2DAttributes.stride(aclStrideInfo); - - // Validate operator, check status and update reasonIfUnsupported - arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch, - &inputInfo, - &weightInfo, - biasSketchInfoPtr, - conv2DAttributes); - - if (createOp) - { - const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); - if (!supported) - { - throw BackendCapabilityException("\"GpuFsa\" backend failed during operation validation when attempting " - "to fuse a GpuConv2d operator into the existing workload sketch."); - } - - arm_compute::ITensorInfo* convOutInfo = GpuConv2d::create_op(sketch, - &inputInfo, - &weightInfo, - biasSketchInfoPtr, 
- conv2DAttributes); - - // Temporary fix until fusing attempt is make for GpuFsa backend and Output layer workload is created. - auto outputInfo = gpuCtx.create_tensor_info(); - GpuOutput::create_op(sketch, convOutInfo, &outputInfo); - } - - return aclStatus; -} - -arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, - const Convolution2dDescriptor& descriptor, - const TensorInfo& weights, - const Optional<TensorInfo>& biases) -{ - return ValidateAndCreateOp(input, descriptor, weights, biases); -} - -void GpuFsaConvolution2dCreateOp(const TensorInfo& input, - const Convolution2dDescriptor& descriptor, - const TensorInfo& weights, - const Optional<TensorInfo>& biases) -{ - ValidateAndCreateOp(input, descriptor, weights, biases, true); -} - -} // namespace armnn
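The deleted file above folded validation and sketch mutation into one ValidateAndCreateOp helper gated by a createOp flag; its replacement in layers/GpuFsaConvolution2d.cpp (next hunk) separates the two, so a support query never touches the blob that will eventually be executed. A sketch of the validation-only path as GpuFsaLayerSupport would drive it (tensor values illustrative):

    armnn::TensorInfo input({1, 8, 8, 16}, armnn::DataType::Float32);
    armnn::TensorInfo weights({4, 3, 3, 16}, armnn::DataType::Float32);
    weights.SetConstant();
    armnn::Convolution2dDescriptor desc;
    desc.m_StrideX = desc.m_StrideY = 1;
    desc.m_DataLayout = armnn::DataLayout::NHWC;

    // Builds a throwaway context/sketch internally; nothing is retained on success.
    arm_compute::Status status =
        armnn::GpuFsaConvolution2dValidate(input, desc, weights, armnn::EmptyOptional());
    bool supported = (status.error_code() == arm_compute::ErrorCode::OK);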
\ No newline at end of file diff --git a/src/backends/gpuFsa/layerValidators/CMakeLists.txt b/src/backends/gpuFsa/layers/CMakeLists.txt index 57ea41d56c..3a02ce1a77 100644 --- a/src/backends/gpuFsa/layerValidators/CMakeLists.txt +++ b/src/backends/gpuFsa/layers/CMakeLists.txt @@ -1,11 +1,11 @@ # -# Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +# Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT # list(APPEND armnnGpuFsaBackendLayerValidators_sources - GpuFsaConvolution2dValidate.cpp - GpuFsaConvolution2dValidate.hpp + GpuFsaConvolution2d.cpp + GpuFsaConvolution2d.hpp ) add_library(armnnGpuFsaBackendLayerValidators OBJECT ${armnnGpuFsaBackendLayerValidators_sources}) diff --git a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp new file mode 100644 index 0000000000..c7137d7ac8 --- /dev/null +++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp @@ -0,0 +1,180 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaConvolution2d.hpp" + +#include <armnn/Types.hpp> + +#include <aclCommon/ArmComputeTensorUtils.hpp> + +#include <arm_compute/core/ITensorInfo.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/CL/CLKernelLibrary.h> +#include <arm_compute/core/CL/CLCompileContext.h> + +#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> +#include <src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h> + +#include <vector> +#include <iostream> + +namespace armnn +{ + +using namespace armcomputetensorutils; + +arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional<TensorInfo>& biases) +{ + // Create a new workload sketch, for validation purposes + auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context(); + auto workloadContext = GpuWorkloadContext(&compileCtx); + GpuWorkloadSketch sketch{ &workloadContext }; + + // Build and create tensor infos using the sketch + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); + arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); + aclWeightsInfo.set_are_values_constant(weights.IsConstant()); + + auto inputInfo = workloadContext.create_tensor_info(aclInputInfo); + auto weightInfo = workloadContext.create_tensor_info(aclWeightsInfo); + + // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op + arm_compute::TensorInfo aclBiasInfo; + arm_compute::TensorInfo biasSketchInfo; + arm_compute::TensorInfo* biasSketchInfoPtr = nullptr; + + if (descriptor.m_BiasEnabled) + { + if(!biases.has_value()) + { + throw InvalidArgumentException("GpuFsaConvolution2d::ValidateOp: No biases set when biases are enabled"); + } + aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); + aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); + + biasSketchInfo = workloadContext.create_tensor_info(aclBiasInfo); + biasSketchInfoPtr = &biasSketchInfo; + } + + // Set Conv2d attributes using descriptor + const 
arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX, + descriptor.m_DilationY); + const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor); + const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY); + + Conv2dAttributes conv2DAttributes{}; + conv2DAttributes.dilation(aclDilationInfo); + conv2DAttributes.pad(aclPadInfo); + conv2DAttributes.stride(aclStrideInfo); + + // Validate operator, check status and update reasonIfUnsupported + arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch, + &inputInfo, + &weightInfo, + biasSketchInfoPtr, + conv2DAttributes); + + return aclStatus; +} + +void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, + const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional<TensorInfo>& biases) +{ +/* + * Creating an Op for the GpuFsa backend requires us to create and maintain quite a bit of data, which is then stored + * in a GpuFsaPreCompiledBlob for execution later. Specifically we need: + * GpuWorkloadContext, this contains the TensorInfos and is unique to the Graph being executed + * Sketch, this is similar to a subgraph and can contain one or more operations. Multiple ops can be "fused" together + * using a single sketch. + * The TensorInfoIds, these are the ids of the TensorInfos used when creating the sketch. They refer to the TensorInfos + * stored within the GpuWorkloadContext and are used to fetch them later when executing the sketch. + */ + using namespace arm_compute::experimental::dynamic_fusion; + GpuWorkloadSketch* sketch = blob->sketch.get(); + GpuWorkloadContext* workloadContext = blob->workloadContext.get(); + std::vector<int32_t> inputIds = {}; + std::vector<int32_t> outputIds = {}; + + // Build and create tensor infos using the sketch + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); + arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); + aclWeightsInfo.set_are_values_constant(weights.IsConstant()); + auto inputInfo = workloadContext->create_tensor_info(aclInputInfo); + inputIds.emplace_back(inputInfo.id()); + + auto weightInfo = workloadContext->create_tensor_info(aclWeightsInfo); + inputIds.emplace_back(weightInfo.id()); + + // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op + arm_compute::TensorInfo aclBiasInfo; + arm_compute::TensorInfo biasSketchInfo; + arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr; + + if (descriptor.m_BiasEnabled) + { + if(!biases.has_value()) + { + throw InvalidArgumentException("GpuFsaConvolution2d::CreateOp: No biases set when biases are enabled"); + } + aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); + aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); + + biasSketchInfo = workloadContext->create_tensor_info(aclBiasInfo); + inputIds.emplace_back(biasSketchInfo.id()); + biasSketchInfoPtr = workloadContext->implementation().get_tensor_info(biasSketchInfo.id()); + } + + // Set Conv2d attributes using descriptor + const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX, + descriptor.m_DilationY); + const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor); + const arm_compute::Size2D aclStrideInfo =
BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY); + + Conv2dAttributes conv2DAttributes{}; + conv2DAttributes.dilation(aclDilationInfo); + conv2DAttributes.pad(aclPadInfo); + conv2DAttributes.stride(aclStrideInfo); + + // Validate operator, check status and update reasonIfUnsupported + arm_compute::Status aclStatus = + GpuConv2d::validate_op(*sketch, + workloadContext->implementation().get_tensor_info(inputInfo.id()), + workloadContext->implementation().get_tensor_info(weightInfo.id()), + biasSketchInfoPtr, + conv2DAttributes); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported) + { + throw BackendCapabilityException("\"GpuFsa\" backend failed during Convolution2D operation validation"); + } + + arm_compute::ITensorInfo* convOutInfo = + GpuConv2d::create_op(*sketch, + workloadContext->implementation().get_tensor_info(inputInfo.id()), + workloadContext->implementation().get_tensor_info(weightInfo.id()), + biasSketchInfoPtr, + conv2DAttributes); + + arm_compute::TensorInfo outputDstInfo = workloadContext->create_tensor_info(); + outputIds.emplace_back(outputDstInfo.id()); + + GpuOutput::create_op(*sketch, convOutInfo, workloadContext->implementation().get_tensor_info(outputDstInfo.id())); + blob->inputIds = std::make_unique<std::vector<int32_t>>(inputIds); + blob->outputIds = std::make_unique<std::vector<int32_t>>(outputIds); +} + +} // namespace armnn diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.hpp index 120060e8ad..3346dc1028 100644 --- a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp +++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -9,6 +9,7 @@ #include <arm_compute/core/Error.h> #include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h> +#include <gpuFsa/GpuFsaBackend.hpp> namespace armnn { @@ -20,9 +21,10 @@ arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, const TensorInfo& weights, const Optional<TensorInfo>& biases); -void GpuFsaConvolution2dCreateOp(const TensorInfo& input, +void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, + const TensorInfo& input, const Convolution2dDescriptor& descriptor, const TensorInfo& weights, const Optional<TensorInfo>& biases); -} // namespace armnn
\ No newline at end of file +} // namespace armnn diff --git a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp index 1d6b99a31f..c2cdd57574 100644 --- a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp +++ b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp @@ -1,8 +1,27 @@ // -// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "backendsCommon/test/EndToEndTestImpl.hpp" -#include <doctest/doctest.h>
\ No newline at end of file +#include "backendsCommon/test/Convolution2dEndToEndTestImpl.hpp" +#include <doctest/doctest.h> + +TEST_SUITE("GpuFsaEndToEnd") +{ + +std::vector<BackendId> gpuFsaDefaultBackends = {"GpuFsa"}; + +// Conv2d +TEST_CASE("GpuFsaConv2dEndtoEndTestFloat32") +{ + Convolution2dEndToEnd<armnn::DataType::Float32>(gpuFsaDefaultBackends, armnn::DataLayout::NHWC); +} + +TEST_CASE("GpuFsaConv2dWithoutBiasEndtoEndTestFloat32") +{ + Convolution2dEndToEnd<armnn::DataType::Float32>(gpuFsaDefaultBackends, armnn::DataLayout::NHWC, false); +} + +} diff --git a/src/backends/gpuFsa/workloads/CMakeLists.txt b/src/backends/gpuFsa/workloads/CMakeLists.txt index 4d100123ea..9edc9e9d3c 100644 --- a/src/backends/gpuFsa/workloads/CMakeLists.txt +++ b/src/backends/gpuFsa/workloads/CMakeLists.txt @@ -1,10 +1,15 @@ # -# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +# Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT # list(APPEND armnnGpuFsaBackendWorkloads_sources GpuFsaBaseWorkload.hpp + GpuFsaConstantWorkload.hpp + GpuFsaConstantWorkload.cpp + GpuFsaPreCompiledWorkload.hpp + GpuFsaPreCompiledWorkload.cpp + GpuFsaWorkloadUtils.hpp ) add_library(armnnGpuFsaBackendWorkloads OBJECT ${armnnGpuFsaBackendWorkloads_sources}) diff --git a/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp new file mode 100644 index 0000000000..39d3c0ddab --- /dev/null +++ b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp @@ -0,0 +1,114 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaConstantWorkload.hpp" +#include "GpuFsaWorkloadUtils.hpp" + +#include <Half.hpp> +#include <aclCommon/ArmComputeTensorUtils.hpp> +#include <gpuFsa/GpuFsaTensorHandle.hpp> +#include <armnn/backends/TensorHandle.hpp> + +namespace armnn +{ + +arm_compute::Status GpuFsaConstantWorkloadValidate(const TensorInfo& output) +{ + const arm_compute::TensorInfo neonOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + std::array<arm_compute::DataType,8> supportedTypes = { + arm_compute::DataType::F16, + arm_compute::DataType::F32, + arm_compute::DataType::QASYMM8, + arm_compute::DataType::QASYMM8_SIGNED, + arm_compute::DataType::QSYMM16, + arm_compute::DataType::QSYMM8, + arm_compute::DataType::QSYMM8_PER_CHANNEL, + arm_compute::DataType::S32 + }; + auto it = std::find(begin(supportedTypes), end(supportedTypes), neonOutputInfo.data_type()); + + if (it != end(supportedTypes)) + { + return arm_compute::Status{}; + } + else + { + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported DataType"}; + } +} + +GpuFsaConstantWorkload::GpuFsaConstantWorkload(const ConstantQueueDescriptor& descriptor, + const WorkloadInfo& info, + const arm_compute::CLCompileContext&) + : GpuFsaBaseWorkload<ConstantQueueDescriptor>(descriptor, info) + , m_RanOnce(false) +{ +} + +void GpuFsaConstantWorkload::Execute() const +{ + // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data + // on the first inference, then reused for subsequent inferences. + // The initialisation cannot happen at workload construction time since the ACL kernel for the next layer may not + // have been configured at the time. 
+ if (!m_RanOnce) + { + const ConstantQueueDescriptor& data = this->m_Data; + + ARMNN_ASSERT(data.m_LayerOutput != nullptr); + arm_compute::CLTensor& output = static_cast<GpuFsaTensorHandle*>(data.m_Outputs[0])->GetTensor(); + arm_compute::DataType computeDataType = static_cast<GpuFsaTensorHandle*>(data.m_Outputs[0])->GetDataType(); + + switch (computeDataType) + { + case arm_compute::DataType::F16: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<Half>()); + break; + } + case arm_compute::DataType::F32: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<float>()); + break; + } + case arm_compute::DataType::QASYMM8: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<uint8_t>()); + break; + } + case arm_compute::DataType::QASYMM8_SIGNED: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int8_t>()); + break; + } + case arm_compute::DataType::QSYMM16: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int16_t>()); + break; + } + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int8_t>()); + break; + } + case arm_compute::DataType::S32: + { + CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int32_t>()); + break; + } + default: + { + ARMNN_ASSERT_MSG(false, "Unknown data type"); + break; + } + } + + m_RanOnce = true; + } +} + +} //namespace armnn diff --git a/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp new file mode 100644 index 0000000000..98b383b89f --- /dev/null +++ b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp @@ -0,0 +1,30 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "GpuFsaBaseWorkload.hpp" + +#include <arm_compute/core/Error.h> +#include <arm_compute/core/CL/CLCompileContext.h> + +namespace armnn +{ + arm_compute::Status GpuFsaConstantWorkloadValidate(const TensorInfo& output); + + class GpuFsaConstantWorkload : public GpuFsaBaseWorkload<ConstantQueueDescriptor> + { + public: + GpuFsaConstantWorkload(const ConstantQueueDescriptor& descriptor, + const WorkloadInfo& info, + const arm_compute::CLCompileContext& clCompileContext); + + void Execute() const override; + + private: + mutable bool m_RanOnce; + }; + +} //namespace armnn diff --git a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp new file mode 100644 index 0000000000..20386b5d86 --- /dev/null +++ b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp @@ -0,0 +1,106 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "GpuFsaPreCompiledWorkload.hpp" +#include "GpuFsaWorkloadUtils.hpp" +#include "armnn/utility/PolymorphicDowncast.hpp" + +#include <gpuFsa/GpuFsaTensorHandle.hpp> +#include <gpuFsa/GpuFsaBackend.hpp> +#include <aclCommon/ArmComputeTensorUtils.hpp> + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/core/ITensorInfo.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/CL/CLKernelLibrary.h> +#include <arm_compute/core/CL/CLCompileContext.h> + +#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> +#include <src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h> + +namespace armnn { + +GpuFsaPreCompiledWorkload::GpuFsaPreCompiledWorkload(const PreCompiledQueueDescriptor &descriptor, + const WorkloadInfo &info) + : BaseWorkload<PreCompiledQueueDescriptor>(descriptor, info), m_workloadInfo(info) +{ + // Check that the workload is holding a pointer to a valid pre-compiled object + if (m_Data.m_PreCompiledObject == nullptr) + { + throw InvalidArgumentException( + "GpuFsaPreCompiledWorkload requires a valid pre-compiled object (GpuFsaPreCompiledBlob)."); + } +} + +void GpuFsaPreCompiledWorkload::Execute() const +{ +/* + * The Execute function of the GpuFsa backend's PreCompiled workload needs to jump through various hoops in order to + * create a valid sketch and runtime that can execute the kernel + * First we need all of the data stored within the PreCompiled blob which was used to set up the workload, namely: + * The GpuWorkloadContext, this is a context which contains the TensorInfos and is unique to the graph being run + * The Sketch, this can contain one or many ops and acts as a subgraph within the context + * The TensorInfoIds, These are the ids of the TensorInfos used during the creation of the Sketch and stored within + * the context. + * It is very important that the Tensors passed into the Runtime being used to execute this sketch are created with + * the same TensorInfos as used when creating the sketch. We do this by creating new tensors, getting the original + * TensorInfos from the GpuWorkloadContext via their ids, and then importing the buffers from our own TensorHandles + * directly into these newly created Tensors. This allows us to link the externally visible Tensors from ArmNN to the + * Tensors which are needed to execute with the Sketch.
+ * + */ + using namespace arm_compute::experimental::dynamic_fusion; + // Get the runtime and configure it with the precompiled sketch + ClWorkloadRuntime runtime; + GpuFsaPreCompiledBlob *preCompiledBlob = static_cast<GpuFsaPreCompiledBlob*>(m_Data.m_PreCompiledObject); + auto workloadContext = + &(preCompiledBlob->workloadContext->implementation()); + auto sketch = preCompiledBlob->sketch.release(); + std::vector<int32_t> inputIds = *(preCompiledBlob->inputIds.get()); + std::vector<int32_t> outputIds = *(preCompiledBlob->outputIds.get()); + auto status = runtime.configure(*sketch); + + // (Important) Allocate auxiliary tensor memory if there are any + for(auto &data : runtime.get_auxiliary_tensors()) + { + arm_compute::CLTensor* tensor = std::get<0>(data); + arm_compute::TensorInfo info = std::get<1>(data); + arm_compute::experimental::dynamic_fusion::AuxMemoryInfo aux_mem_req = std::get<2>(data); + tensor->allocator()->init(info, aux_mem_req.alignment); + tensor->allocator()->allocate(); // Use ACL allocated memory + } + + // Create and initialize user tensors + std::vector<arm_compute::CLTensor*> inputsWeightsOutputs; + inputsWeightsOutputs.reserve(m_Data.m_Inputs.size() + m_Data.m_Outputs.size()); + + for (uint32_t inputSlotIdx = 0; inputSlotIdx < m_Data.m_Inputs.size(); ++inputSlotIdx) + { + arm_compute::CLTensor* input = new arm_compute::CLTensor{}; + input->allocator()->init(*(dynamic_cast<arm_compute::TensorInfo*>( + workloadContext->get_tensor_info(inputIds[inputSlotIdx])))); + auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Inputs[inputSlotIdx]); + input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer()); + inputsWeightsOutputs.emplace_back(std::move(input)); + } + // Set the outputs + for (uint32_t outputSlotIdx = 0; outputSlotIdx < m_Data.m_Outputs.size(); ++outputSlotIdx) + { + arm_compute::CLTensor* output = new arm_compute::CLTensor{}; + output->allocator()->init(*(dynamic_cast<arm_compute::TensorInfo*>( + workloadContext->get_tensor_info(outputIds[outputSlotIdx])))); + auto* outputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Outputs[outputSlotIdx]); + output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer()); + inputsWeightsOutputs.emplace_back(std::move(output)); + } + runtime.run(inputsWeightsOutputs); +} +} // namespace armnn
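The buffer-import step described in the Execute comment above is the crux: tensors handed to ClWorkloadRuntime must carry the exact TensorInfos the sketch was built from, while aliasing ArmNN's existing CL buffers. Reduced to a single input (a sketch of the loop above, with error handling omitted):

    // Recreate a CLTensor from the TensorInfo stored in the workload context...
    arm_compute::CLTensor* input = new arm_compute::CLTensor{};
    input->allocator()->init(*(dynamic_cast<arm_compute::TensorInfo*>(
        workloadContext->get_tensor_info(inputIds[0]))));

    // ...then import the CL buffer of the ArmNN tensor handle instead of allocating.
    // After this, the runtime reads and writes the ArmNN-owned memory in place.
    auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Inputs[0]);
    input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer());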
\ No newline at end of file diff --git a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp new file mode 100644 index 0000000000..d29bf37e69 --- /dev/null +++ b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp @@ -0,0 +1,56 @@ +// +// Copyright © 2024 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "armnn/backends/Workload.hpp" + +#include <arm_compute/core/ITensorInfo.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/CL/CLKernelLibrary.h> +#include <arm_compute/core/CL/CLCompileContext.h> + +#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h> +#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h> + +#include <memory> +#include <string> +#include <vector> + +namespace armnn +{ + +bool GpuFsaPreCompiledWorkloadValidate(std::string* reasonIfUnsupported); + +class GpuFsaPreCompiledWorkload : public BaseWorkload<PreCompiledQueueDescriptor> +{ +public: + GpuFsaPreCompiledWorkload(const PreCompiledQueueDescriptor& descriptor, + const WorkloadInfo& info); + void Execute() const override; + +private: + bool SupportsTensorHandleReplacement() const override + { + return true; + } + + void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override + { + this->m_Data.m_Inputs[slot] = tensorHandle; + } + + void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override + { + this->m_Data.m_Outputs[slot] = tensorHandle; + } + + WorkloadInfo m_workloadInfo; +}; + +} //namespace armnn
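Because the workload keeps no per-run state beyond the blob, it can advertise SupportsTensorHandleReplacement(): the overrides simply re-point the descriptor's slots, and the next Execute() imports whatever buffer the new handle carries. A hedged sketch of how the runtime drives this through the IWorkload interface (the overrides are private in the class, so they are only reachable via the base class; newHandle is a hypothetical imported handle):

    armnn::IWorkload* workload = GetPreCompiledWorkload();  // hypothetical accessor
    if (workload->SupportsTensorHandleReplacement())
    {
        workload->ReplaceInputTensorHandle(newHandle, 0);
    }
    workload->Execute();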
\ No newline at end of file diff --git a/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp b/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp new file mode 100644 index 0000000000..10954b07b5 --- /dev/null +++ b/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp @@ -0,0 +1,163 @@ +// +// Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include <BFloat16.hpp> +#include <Half.hpp> + +#include <aclCommon/ArmComputeTensorUtils.hpp> +#include <armnn/backends/TensorHandle.hpp> + +#include <armnn/Utils.hpp> + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/IFunction.h> + +#include <sstream> + + +namespace armnn +{ + + inline std::string GetConvolutionMethodString(arm_compute::ConvolutionMethod& convolutionMethod) + { + switch (convolutionMethod) + { + case arm_compute::ConvolutionMethod::FFT: + return "FFT"; + case arm_compute::ConvolutionMethod::DIRECT: + return "Direct"; + case arm_compute::ConvolutionMethod::GEMM: + return "GEMM"; + case arm_compute::ConvolutionMethod::WINOGRAD: + return "Winograd"; + default: + return "Unknown"; + } + } + + template <typename T> + void CopyArmComputeClTensorData(arm_compute::CLTensor& dstTensor, const T* srcData) + { + { + dstTensor.map(true); + } + + { + armcomputetensorutils::CopyArmComputeITensorData<T>(srcData, dstTensor); + } + + dstTensor.unmap(); + } + + inline auto SetClStridedSliceData(const std::vector<int>& m_begin, + const std::vector<int>& m_end, + const std::vector<int>& m_stride) + { + arm_compute::Coordinates starts; + arm_compute::Coordinates ends; + arm_compute::Coordinates strides; + + unsigned int num_dims = static_cast<unsigned int>(m_begin.size()); + + for (unsigned int i = 0; i < num_dims; i++) { + unsigned int revertedIndex = num_dims - i - 1; + + starts.set(i, static_cast<int>(m_begin[revertedIndex])); + ends.set(i, static_cast<int>(m_end[revertedIndex])); + strides.set(i, static_cast<int>(m_stride[revertedIndex])); + } + + return std::make_tuple(starts, ends, strides); + } + + inline auto SetClSliceData(const std::vector<unsigned int>& m_begin, + const std::vector<unsigned int>& m_size) + { + // This function must translate the size vector given to an end vector + // expected by the ACL NESlice workload + arm_compute::Coordinates starts; + arm_compute::Coordinates ends; + + unsigned int num_dims = static_cast<unsigned int>(m_begin.size()); + + // For strided slices, we have the relationship size = (end - begin) / stride + // For slice, we assume stride to be a vector of all ones, yielding the formula + // size = (end - begin) therefore we know end = size + begin + for (unsigned int i = 0; i < num_dims; i++) + { + unsigned int revertedIndex = num_dims - i - 1; + + starts.set(i, static_cast<int>(m_begin[revertedIndex])); + ends.set(i, static_cast<int>(m_begin[revertedIndex] + m_size[revertedIndex])); + } + + return std::make_tuple(starts, ends); + } + + inline void InitializeArmComputeClTensorData(arm_compute::CLTensor& clTensor, + const ConstTensorHandle* handle) + { + ARMNN_ASSERT(handle); + + armcomputetensorutils::InitialiseArmComputeTensorEmpty(clTensor); + switch(handle->GetTensorInfo().GetDataType()) + { + case DataType::Float16: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::Half>()); + break; + case DataType::Float32: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<float>()); + break; + case DataType::QAsymmU8: + CopyArmComputeClTensorData(clTensor, 
handle->GetConstTensor<uint8_t>()); + break; + case DataType::QAsymmS8: + case DataType::QSymmS8: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int8_t>()); + break; + case DataType::QSymmS16: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int16_t>()); + break; + case DataType::Signed32: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int32_t>()); + break; + case DataType::BFloat16: + CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::BFloat16>()); + break; + default: + // Throw exception; assertion not called in release build. + throw Exception("Unexpected tensor type during InitializeArmComputeClTensorData()."); + } + }; + + inline RuntimeException WrapClError(const cl::Error& clError, const CheckLocation& location) + { + std::stringstream message; + message << "CL error: " << clError.what() << ". Error code: " << clError.err(); + + return RuntimeException(message.str(), location); + } + + inline void RunClFunction(arm_compute::IFunction& function, const CheckLocation& location) + { + try + { + function.run(); + } + catch (cl::Error& error) + { + throw WrapClError(error, location); + } + } + + template <typename DataType, typename PayloadType> + DataType* GetOutputTensorData(unsigned int idx, const PayloadType& data) + { + ITensorHandle* tensorHandle = data.m_Outputs[idx]; + return reinterpret_cast<DataType*>(tensorHandle->Map()); + } + +} //namespace armnn
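Of the helpers in GpuFsaWorkloadUtils.hpp, SetClSliceData is the least obvious: ACL coordinates run innermost-dimension-first, so the begin/size vectors are reversed and each end is computed as begin + size. A small worked example (values illustrative):

    std::vector<unsigned int> begin = {0, 1, 0, 0};  // ArmNN order, outermost dimension first
    std::vector<unsigned int> size  = {1, 2, 3, 4};

    auto [starts, ends] = armnn::SetClSliceData(begin, size);
    // Reversed into ACL order, with ends = begin + size per dimension:
    // starts = (0, 0, 1, 0), ends = (4, 3, 3, 1)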