author     David Monahan <david.monahan@arm.com>    2023-12-08 12:50:02 +0000
committer  David Monahan <david.monahan@arm.com>    2024-01-22 16:23:48 +0000
commit     bd738081b8fcea4599a06f01d5c07979f3b0fcb3 (patch)
tree       fa0c5ef9765e5f4de8551114752473c0a71747cd
parent     5bc14146eb310465f3d7c59ac294aa32bc69984a (diff)
download   armnn-bd738081b8fcea4599a06f01d5c07979f3b0fcb3.tar.gz
IVGCVSW-7165 - PreCompiledWorkload and Conv2d Integration work for GpuFsa
* Add PreCompiledWorkload implementation for GpuFsa
* Add ConstantWorkload implementation for GpuFsa
* Add Input/Output workloads for GpuFsa
* Add CopyMemGeneric workload for GpuFsa
* Separate creation and validation of sketch tensors into separate functions

Signed-off-by: Kevin May <kevin.may@arm.com>
Signed-off-by: David Monahan <david.monahan@arm.com>
Change-Id: Ie7299a4c61073b5ca03d9f8681458869ef7ce743
-rw-r--r--  src/backends/gpuFsa/CMakeLists.txt | 2
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.cpp | 32
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.hpp | 24
-rw-r--r--  src/backends/gpuFsa/GpuFsaLayerSupport.cpp | 4
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp | 58
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp | 6
-rw-r--r--  src/backends/gpuFsa/backend.mk | 24
-rw-r--r--  src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp | 126
-rw-r--r--  src/backends/gpuFsa/layers/CMakeLists.txt (renamed from src/backends/gpuFsa/layerValidators/CMakeLists.txt) | 6
-rw-r--r--  src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp | 180
-rw-r--r--  src/backends/gpuFsa/layers/GpuFsaConvolution2d.hpp (renamed from src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp) | 8
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp | 23
-rw-r--r--  src/backends/gpuFsa/workloads/CMakeLists.txt | 7
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp | 114
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp | 30
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp | 106
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp | 56
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp | 163
18 files changed, 794 insertions, 175 deletions
diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt
index 8d1a58ee27..5181f2288e 100644
--- a/src/backends/gpuFsa/CMakeLists.txt
+++ b/src/backends/gpuFsa/CMakeLists.txt
@@ -23,7 +23,7 @@ if(ARMCOMPUTEGPUFSA)
GpuFsaWorkloadFactory.hpp
)
- add_subdirectory(layerValidators)
+ add_subdirectory(layers)
add_subdirectory(workloads)
if(BUILD_UNIT_TESTS)
diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp
index 8ea9e8e7d3..9886a6e187 100644
--- a/src/backends/gpuFsa/GpuFsaBackend.cpp
+++ b/src/backends/gpuFsa/GpuFsaBackend.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -20,10 +20,7 @@
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>
-#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
-#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
-
-#include "layerValidators/GpuFsaConvolution2dValidate.hpp"
+#include "layers/GpuFsaConvolution2d.hpp"
namespace armnn
{
@@ -218,9 +215,6 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra
OptimizationViews optimizationViews(modelOptions);
using namespace arm_compute::experimental::dynamic_fusion;
- // Create a new workload sketch, for validation purposes
- auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
- auto gpuCtx = GpuWorkloadContext(&compileCtx);
auto it = subgraph.end();
std::map<LayerGuid, Layer*> untouched;
@@ -233,32 +227,41 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra
GpuFsaLayerSupport supportChecker;
it = subgraph.end();
+ arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());
+
+ // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
+ std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
while (it != subgraph.begin())
{
--it;
Layer& base = *(PolymorphicDowncast<Layer*>(*it));
+ // Create a GpuFsaPreCompiledBlob, this contains all of the information needed to execute an operator
+ GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
+ preCompiledBlobPtr->workloadContext = workloadContext;
+ preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());
- std::unique_ptr<GpuWorkloadSketch> sketch = std::make_unique<GpuWorkloadSketch>(&gpuCtx);
+ // Configure and setup the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
switch (base.GetType())
{
case (LayerType::Convolution2d):
{
auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
- //std::vector<TensorInfo> infos = {input, weights};
auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
if (desc->m_BiasEnabled)
{
auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
- GpuFsaConvolution2dCreateOp(input,
+ GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
+ input,
*desc,
weights,
bias);
}
else
{
- GpuFsaConvolution2dCreateOp(input,
+ GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
+ input,
*desc,
weights,
EmptyOptional());
@@ -270,7 +273,8 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra
continue;
}
- auto compiledBlob = std::make_unique<PreCompiledObjectPtr>(sketch.release(), DeleteAsType<GpuWorkloadSketch>);
+ auto compiledBlob =
+ std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);
IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
@@ -289,7 +293,7 @@ OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgra
CreateOutputsFrom(&base),
{&base});
- optimizationViews.AddSubstitution({ *substituteSubgraph, SubgraphView(preCompiledLayer) });
+ optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });
untouched.erase(base.GetGuid());
}
diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp
index 26960065c7..4c2a5f02e3 100644
--- a/src/backends/gpuFsa/GpuFsaBackend.hpp
+++ b/src/backends/gpuFsa/GpuFsaBackend.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -11,6 +11,8 @@
#include <arm_compute/runtime/CL/CLMemoryRegion.h>
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <CL/cl_ext.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
// System includes for mapping and unmapping memory
#include <sys/mman.h>
@@ -18,13 +20,31 @@
namespace armnn
{
+/**
+ * A structure which contains all the elements needed to execute a fused workload in the GpuFsa Backend
+ *
+ * @param[in, out] sketch A unique pointer to the sketch containing the operators which have been fused.
+ * @param[in, out] workloadContext A shared pointer to the GpuWorkloadContext which contains the TensorInfos
+ * @param[in, out] inputIds A unique pointer to a vector of input Ids used to access workloadContext TensorInfos
+ * @param[in, out] outputIds A unique pointer to a vector of output Ids used to access workloadContext TensorInfos
+ *
+ */
+struct GpuFsaPreCompiledBlob
+{
+ std::unique_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch> sketch = nullptr;
+ std::shared_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadContext> workloadContext = nullptr;
+
+ std::unique_ptr<std::vector<int32_t>> inputIds = nullptr;
+ std::unique_ptr<std::vector<int32_t>> outputIds = nullptr;
+};
+
// add new capabilities here..
const BackendCapabilities gpuFsaCapabilities("GpuFsa",
{
{"NonConstWeights", false},
{"AsyncExecution", false},
{"ProtectedContentAllocation", false},
- {"ConstantTensorsAsInputs", false},
+ {"ConstantTensorsAsInputs", true},
{"PreImportIOTensors", false},
{"ExternallyManagedMemory", false},
{"MultiAxisPacking", false},
diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp
index 063af2732e..96c986ab33 100644
--- a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp
+++ b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -10,7 +10,7 @@
#include <armnn/utility/PolymorphicDowncast.hpp>
#if defined(ARMCOMPUTEGPUFSA_ENABLED)
-#include "layerValidators/GpuFsaConvolution2dValidate.hpp"
+#include "layers/GpuFsaConvolution2d.hpp"
#endif
#include <vector>
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
index 6d13879f51..faa0d38386 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -9,6 +9,11 @@
#include "GpuFsaBackendId.hpp"
#include "GpuFsaTensorHandle.hpp"
+#include "workloads/GpuFsaConstantWorkload.hpp"
+#include "workloads/GpuFsaPreCompiledWorkload.hpp"
+
+#include <armnn/backends/MemCopyWorkload.hpp>
+
namespace armnn
{
@@ -43,11 +48,13 @@ bool IsDataType(const WorkloadInfo& info)
GpuFsaWorkloadFactory::GpuFsaWorkloadFactory(const std::shared_ptr<GpuFsaMemoryManager>& memoryManager)
: m_MemoryManager(memoryManager)
{
+ InitializeCLCompileContext();
}
GpuFsaWorkloadFactory::GpuFsaWorkloadFactory()
: m_MemoryManager(new GpuFsaMemoryManager())
{
+ InitializeCLCompileContext();
}
const BackendId& GpuFsaWorkloadFactory::GetBackendId() const
@@ -81,11 +88,52 @@ std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const T
return tensorHandle;
}
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/,
- const QueueDescriptor& /*descriptor*/,
- const WorkloadInfo& /*info*/) const
+
+void GpuFsaWorkloadFactory::InitializeCLCompileContext() {
+ // Initialize our m_CLCompileContext using default device and context
+ auto context = arm_compute::CLKernelLibrary::get().context();
+ auto device = arm_compute::CLKernelLibrary::get().get_device();
+ m_CLCompileContext = arm_compute::CLCompileContext(context, device);
+}
+
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType type,
+ const QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
{
- return nullptr;
+ switch(type)
+ {
+ case LayerType::Constant :
+ {
+ auto constQueueDescriptor = PolymorphicDowncast<const ConstantQueueDescriptor*>(&descriptor);
+ return std::make_unique<GpuFsaConstantWorkload>(*constQueueDescriptor, info, m_CLCompileContext);
+ }
+ case LayerType::Input :
+ {
+ auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
+ return std::make_unique<CopyMemGenericWorkload>(*inputQueueDescriptor, info);
+ }
+ case LayerType::Output :
+ {
+ auto outputQueueDescriptor = PolymorphicDowncast<const OutputQueueDescriptor*>(&descriptor);
+ return std::make_unique<CopyMemGenericWorkload>(*outputQueueDescriptor, info);
+ }
+ case LayerType::MemCopy :
+ {
+ auto memCopyQueueDescriptor = PolymorphicDowncast<const MemCopyQueueDescriptor*>(&descriptor);
+ if (memCopyQueueDescriptor->m_Inputs.empty() || !memCopyQueueDescriptor->m_Inputs[0])
+ {
+ throw InvalidArgumentException("GpuFsaWorkloadFactory: Invalid null input for MemCopy workload");
+ }
+ return std::make_unique<CopyMemGenericWorkload>(*memCopyQueueDescriptor, info);
+ }
+ case LayerType::PreCompiled :
+ {
+ auto precompiledQueueDescriptor = PolymorphicDowncast<const PreCompiledQueueDescriptor*>(&descriptor);
+ return std::make_unique<GpuFsaPreCompiledWorkload>(*precompiledQueueDescriptor, info);
+ }
+ default :
+ return nullptr;
+ }
}
} // namespace armnn \ No newline at end of file
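
As a usage sketch (not part of the patch), the new dispatch can be exercised roughly as follows; this assumes the internal backend headers are on the include path and that an OpenCL device is available, since the factory constructor now also initialises a CLCompileContext.

// Sketch only: creating a MemCopy workload through the new CreateWorkload() switch.
#include "GpuFsaWorkloadFactory.hpp"          // internal backend header
#include <armnn/backends/WorkloadData.hpp>    // MemCopyQueueDescriptor
#include <armnn/backends/WorkloadInfo.hpp>
#include <memory>

std::unique_ptr<armnn::IWorkload> MakeMemCopyWorkload(armnn::ITensorHandle* src,
                                                      armnn::ITensorHandle* dst,
                                                      const armnn::WorkloadInfo& info)
{
    armnn::GpuFsaWorkloadFactory factory;     // also runs InitializeCLCompileContext()

    // info is expected to describe exactly one input and one output tensor.
    armnn::MemCopyQueueDescriptor descriptor;
    descriptor.m_Inputs.push_back(src);       // an empty/null input now throws
    descriptor.m_Outputs.push_back(dst);

    // Unsupported layer types still fall through to the default case and return nullptr.
    return factory.CreateWorkload(armnn::LayerType::MemCopy, descriptor, info);
}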
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
index 9b97070766..04074cf0ab 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -8,6 +8,8 @@
#include <armnn/Optional.hpp>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
namespace armnn
{
@@ -44,6 +46,7 @@ public:
std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
DataLayout dataLayout,
const bool IsMemoryManaged = true) const override;
+ void InitializeCLCompileContext();
std::unique_ptr<IWorkload> CreateWorkload(LayerType type,
const QueueDescriptor& descriptor,
@@ -54,6 +57,7 @@ private:
std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const;
mutable std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
+ arm_compute::CLCompileContext m_CLCompileContext;
};
} // namespace armnn
diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk
index d8d254205b..a219ad4fec 100644
--- a/src/backends/gpuFsa/backend.mk
+++ b/src/backends/gpuFsa/backend.mk
@@ -1,5 +1,5 @@
#
-# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
@@ -22,35 +22,29 @@ BACKEND_SOURCES := \
GpuFsaRegistryInitializer.cpp \
GpuFsaTensorHandleFactory.cpp \
GpuFsaWorkloadFactory.cpp \
- layerValidators/GpuFsaConvolution2dValidate.cpp
-else
-
-# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
-# No source file will be compiled for the GPU Dynamic Fusion backend
-
-BACKEND_SOURCES :=
-
-endif
+ layers/GpuFsaConvolution2d.cpp
# BACKEND_TEST_SOURCES contains the list of files to be included
# in the Android unit test build (armnn-tests) and it is picked
# up by the Android.mk file in the root of ArmNN
-# The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
-
# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
# Include the source files for the GPU Dynamic Fusion backend tests
BACKEND_TEST_SOURCES := \
- test/GpuFsaEndToEndTests.cpp \
+ test/GpuFsaEndToEndTests.cpp \
test/GpuFsaLayerSupportTests.cpp \
test/GpuFsaLayerTests.cpp \
test/GpuFsaOptimizedNetworkTests.cpp
else
# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
+# No source file will be compiled for the GPU Dynamic Fusion backend
+
+BACKEND_SOURCES :=
+
+
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
# No source file will be compiled for the GPU Dynamic Fusion backend tests
BACKEND_TEST_SOURCES :=
diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp
deleted file mode 100644
index bed7b26f74..0000000000
--- a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-//
-// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include "GpuFsaConvolution2dValidate.hpp"
-
-#include <armnn/Types.hpp>
-#include <armnn/utility/IgnoreUnused.hpp>
-
-#include <aclCommon/ArmComputeTensorUtils.hpp>
-
-#include <arm_compute/core/ITensorInfo.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/core/TensorShape.h>
-#include <arm_compute/core/CL/CLKernelLibrary.h>
-#include <arm_compute/core/CL/CLCompileContext.h>
-
-#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
-#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
-#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
-#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
-
-#include <vector>
-#include <iostream>
-
-namespace armnn
-{
-
-using namespace armcomputetensorutils;
-
-inline arm_compute::Status ValidateAndCreateOp(const TensorInfo& input,
- const Convolution2dDescriptor& descriptor,
- const TensorInfo& weights,
- const Optional<TensorInfo>& biases,
- const bool createOp = false)
-{
- // Create a new workload sketch, for validation purposes
- auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
- auto gpuCtx = GpuWorkloadContext(&compileCtx);
- GpuWorkloadSketch sketch{ &gpuCtx };
-
- // Build and create tensor infos using the sketch
- const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
- arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
- aclWeightsInfo.set_are_values_constant(weights.IsConstant());
-
- auto inputInfo = gpuCtx.create_tensor_info(aclInputInfo);
- auto weightInfo = gpuCtx.create_tensor_info(aclWeightsInfo);
-
- // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op
- arm_compute::TensorInfo aclBiasInfo;
- arm_compute::TensorInfo biasSketchInfo;
- arm_compute::TensorInfo* biasSketchInfoPtr = nullptr;
-
- if (descriptor.m_BiasEnabled)
- {
- if(!biases.has_value())
- {
- throw InvalidArgumentException("GpuFsaConvolution2dValidate: No biases set when biases are enabled");
- }
- aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
- aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
-
- biasSketchInfo = gpuCtx.create_tensor_info(aclBiasInfo);
- biasSketchInfoPtr = &biasSketchInfo;
- }
-
- // Set Conv2d attributes using descriptor
- const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX,
- descriptor.m_DilationY);
- const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
- const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
-
- Conv2dAttributes conv2DAttributes{};
- conv2DAttributes.dilation(aclDilationInfo);
- conv2DAttributes.pad(aclPadInfo);
- conv2DAttributes.stride(aclStrideInfo);
-
- // Validate operator, check status and update reasonIfUnsupported
- arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch,
- &inputInfo,
- &weightInfo,
- biasSketchInfoPtr,
- conv2DAttributes);
-
- if (createOp)
- {
- const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
- if (!supported)
- {
- throw BackendCapabilityException("\"GpuFsa\" backend failed during operation validation when attempting "
- "to fuse a GpuConv2d operator into the existing workload sketch.");
- }
-
- arm_compute::ITensorInfo* convOutInfo = GpuConv2d::create_op(sketch,
- &inputInfo,
- &weightInfo,
- biasSketchInfoPtr,
- conv2DAttributes);
-
- // Temporary fix until fusing attempt is make for GpuFsa backend and Output layer workload is created.
- auto outputInfo = gpuCtx.create_tensor_info();
- GpuOutput::create_op(sketch, convOutInfo, &outputInfo);
- }
-
- return aclStatus;
-}
-
-arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input,
- const Convolution2dDescriptor& descriptor,
- const TensorInfo& weights,
- const Optional<TensorInfo>& biases)
-{
- return ValidateAndCreateOp(input, descriptor, weights, biases);
-}
-
-void GpuFsaConvolution2dCreateOp(const TensorInfo& input,
- const Convolution2dDescriptor& descriptor,
- const TensorInfo& weights,
- const Optional<TensorInfo>& biases)
-{
- ValidateAndCreateOp(input, descriptor, weights, biases, true);
-}
-
-} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/layerValidators/CMakeLists.txt b/src/backends/gpuFsa/layers/CMakeLists.txt
index 57ea41d56c..3a02ce1a77 100644
--- a/src/backends/gpuFsa/layerValidators/CMakeLists.txt
+++ b/src/backends/gpuFsa/layers/CMakeLists.txt
@@ -1,11 +1,11 @@
#
-# Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
list(APPEND armnnGpuFsaBackendLayerValidators_sources
- GpuFsaConvolution2dValidate.cpp
- GpuFsaConvolution2dValidate.hpp
+ GpuFsaConvolution2d.cpp
+ GpuFsaConvolution2d.hpp
)
add_library(armnnGpuFsaBackendLayerValidators OBJECT ${armnnGpuFsaBackendLayerValidators_sources})
diff --git a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp
new file mode 100644
index 0000000000..c7137d7ac8
--- /dev/null
+++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp
@@ -0,0 +1,180 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaConvolution2d.hpp"
+
+#include <armnn/Types.hpp>
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/ITensorInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
+#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
+
+#include <vector>
+#include <iostream>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases)
+{
+ // Create a new workload sketch, for validation purposes
+ auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
+ auto workloadContext = GpuWorkloadContext(&compileCtx);
+ GpuWorkloadSketch sketch{ &workloadContext };
+
+ // Build and create tensor infos using the sketch
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+ aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+
+ auto inputInfo = workloadContext.create_tensor_info(aclInputInfo);
+ auto weightInfo = workloadContext.create_tensor_info(aclWeightsInfo);
+
+ // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op
+ arm_compute::TensorInfo aclBiasInfo;
+ arm_compute::TensorInfo biasSketchInfo;
+ arm_compute::TensorInfo* biasSketchInfoPtr = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ if(!biases.has_value())
+ {
+ throw InvalidArgumentException("GpuFsaConvolution2d::ValidateOp: No biases set when biases are enabled");
+ }
+ aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
+ aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
+
+ biasSketchInfo = workloadContext.create_tensor_info(aclBiasInfo);
+ biasSketchInfoPtr = &biasSketchInfo;
+ }
+
+ // Set Conv2d attributes using descriptor
+ const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX,
+ descriptor.m_DilationY);
+ const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
+ const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
+
+ Conv2dAttributes conv2DAttributes{};
+ conv2DAttributes.dilation(aclDilationInfo);
+ conv2DAttributes.pad(aclPadInfo);
+ conv2DAttributes.stride(aclStrideInfo);
+
+ // Validate operator, check status and update reasonIfUnsupported
+ arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch,
+ &inputInfo,
+ &weightInfo,
+ biasSketchInfoPtr,
+ conv2DAttributes);
+
+ return aclStatus;
+}
+
+void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob,
+ const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases)
+{
+/*
+ * Creating an Op for the GpuFsa backend requires us to create and maintain quite a bit of data, which is then stored
+ * in a GpuFsaPreCompiledBlob for execution later. Specifically we need:
+ * GpuWorkloadContext, this contains the TensorInfos and is unique to the Graph being executed
+ * Sketch, this is similar to a subgraph and can contain one or more operations. Multiple ops can be "fused" together
+ * using a single sketch.
+ * The TensorInfoIds, these are the ids of the TensorInfos used when creating the sketch. They refer to the TensorInfos
+ * stored within the GpuWorkloadContext and are used to fetch them later when executing the sketch.
+ */
+ using namespace arm_compute::experimental::dynamic_fusion;
+ GpuWorkloadSketch* sketch = blob->sketch.get();
+ GpuWorkloadContext* workloadContext = blob->workloadContext.get();
+ std::vector<int32_t> inputIds = {};
+ std::vector<int32_t> outputIds = {};
+
+ // Build and create tensor infos using the sketch
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+ aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+ auto inputInfo = workloadContext->create_tensor_info(aclInputInfo);
+ aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+ inputIds.emplace_back(inputInfo.id());
+
+ auto weightInfo = workloadContext->create_tensor_info(aclWeightsInfo);
+ inputIds.emplace_back(weightInfo.id());
+
+ // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op
+ arm_compute::TensorInfo aclBiasInfo;
+ arm_compute::TensorInfo biasSketchInfo;
+ arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ if(!biases.has_value())
+ {
+ throw InvalidArgumentException("GpuFsaConvolution2d::CreateOp: No biases set when biases are enabled");
+ }
+ aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
+ aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
+
+ biasSketchInfo = workloadContext->create_tensor_info(aclBiasInfo);
+ inputIds.emplace_back(biasSketchInfo.id());
+ biasSketchInfoPtr = workloadContext->implementation().get_tensor_info(biasSketchInfo.id());
+ }
+
+ // Set Conv2d attributes using descriptor
+ const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX,
+ descriptor.m_DilationY);
+ const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
+ const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
+
+ Conv2dAttributes conv2DAttributes{};
+ conv2DAttributes.dilation(aclDilationInfo);
+ conv2DAttributes.pad(aclPadInfo);
+ conv2DAttributes.stride(aclStrideInfo);
+
+ // Validate operator, check status and update reasonIfUnsupported
+ arm_compute::Status aclStatus =
+ GpuConv2d::validate_op(*sketch,
+ workloadContext->implementation().get_tensor_info(inputInfo.id()),
+ workloadContext->implementation().get_tensor_info(weightInfo.id()),
+ biasSketchInfoPtr,
+ conv2DAttributes);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported)
+ {
+ throw BackendCapabilityException("\"GpuFsa\" backend failed during Convolution2D operation validation");
+ }
+
+ arm_compute::ITensorInfo* convOutInfo =
+ GpuConv2d::create_op(*sketch,
+ workloadContext->implementation().get_tensor_info(inputInfo.id()),
+ workloadContext->implementation().get_tensor_info(weightInfo.id()),
+ biasSketchInfoPtr,
+ conv2DAttributes);
+
+ arm_compute::TensorInfo outputDstInfo = workloadContext->create_tensor_info();
+ outputIds.emplace_back(outputDstInfo.id());
+
+ GpuOutput::create_op(*sketch, convOutInfo, workloadContext->implementation().get_tensor_info(outputDstInfo.id()));
+ blob->inputIds = std::make_unique<std::vector<int32_t>>(inputIds);
+ blob->outputIds = std::make_unique<std::vector<int32_t>>(outputIds);
+}
+
+} // namespace armnn
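
A short sketch (not part of the patch) of the intended call sequence for these two helpers, mirroring how GpuFsaLayerSupport and GpuFsaBackend::OptimizeSubgraphView use them:

// Sketch only: validate first, then record the operator into the blob's sketch.
#include "layers/GpuFsaConvolution2d.hpp"
#include <armnn/Descriptors.hpp>
#include <armnn/Optional.hpp>
#include <armnn/Tensor.hpp>

bool TryFuseConv2d(armnn::GpuFsaPreCompiledBlob* blob,
                   const armnn::TensorInfo& input,
                   const armnn::Convolution2dDescriptor& descriptor,
                   const armnn::TensorInfo& weights,
                   const armnn::Optional<armnn::TensorInfo>& biases)
{
    // Standalone check against a throwaway context/sketch.
    arm_compute::Status status = armnn::GpuFsaConvolution2dValidate(input, descriptor, weights, biases);
    if (status.error_code() != arm_compute::ErrorCode::OK)
    {
        return false;
    }

    // Record the operator; the blob's inputIds/outputIds are filled as a side effect.
    armnn::GpuFsaConvolution2dCreateOp(blob, input, descriptor, weights, biases);
    return true;
}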
diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.hpp
index 120060e8ad..3346dc1028 100644
--- a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp
+++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -9,6 +9,7 @@
#include <arm_compute/core/Error.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
+#include <gpuFsa/GpuFsaBackend.hpp>
namespace armnn
{
@@ -20,9 +21,10 @@ arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input,
const TensorInfo& weights,
const Optional<TensorInfo>& biases);
-void GpuFsaConvolution2dCreateOp(const TensorInfo& input,
+void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob,
+ const TensorInfo& input,
const Convolution2dDescriptor& descriptor,
const TensorInfo& weights,
const Optional<TensorInfo>& biases);
-} // namespace armnn \ No newline at end of file
+} // namespace armnn
diff --git a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp
index 1d6b99a31f..c2cdd57574 100644
--- a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp
+++ b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp
@@ -1,8 +1,27 @@
//
-// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "backendsCommon/test/EndToEndTestImpl.hpp"
-#include <doctest/doctest.h> \ No newline at end of file
+#include "backendsCommon/test/Convolution2dEndToEndTestImpl.hpp"
+#include <doctest/doctest.h>
+
+TEST_SUITE("GpuFsaEndToEnd")
+{
+
+std::vector<BackendId> gpuFsaDefaultBackends = {"GpuFsa"};
+
+// Conv2d
+TEST_CASE("GpuFsaConv2dEndtoEndTestFloat32")
+{
+ Convolution2dEndToEnd<armnn::DataType::Float32>(gpuFsaDefaultBackends, armnn::DataLayout::NHWC);
+}
+
+TEST_CASE("GpuFsaConv2dWithoutBiasEndtoEndTestFloat32")
+{
+ Convolution2dEndToEnd<armnn::DataType::Float32>(gpuFsaDefaultBackends, armnn::DataLayout::NHWC, false);
+}
+
+}
diff --git a/src/backends/gpuFsa/workloads/CMakeLists.txt b/src/backends/gpuFsa/workloads/CMakeLists.txt
index 4d100123ea..9edc9e9d3c 100644
--- a/src/backends/gpuFsa/workloads/CMakeLists.txt
+++ b/src/backends/gpuFsa/workloads/CMakeLists.txt
@@ -1,10 +1,15 @@
#
-# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
list(APPEND armnnGpuFsaBackendWorkloads_sources
GpuFsaBaseWorkload.hpp
+ GpuFsaConstantWorkload.hpp
+ GpuFsaConstantWorkload.cpp
+ GpuFsaPreCompiledWorkload.hpp
+ GpuFsaPreCompiledWorkload.cpp
+ GpuFsaWorkloadUtils.hpp
)
add_library(armnnGpuFsaBackendWorkloads OBJECT ${armnnGpuFsaBackendWorkloads_sources})
diff --git a/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp
new file mode 100644
index 0000000000..39d3c0ddab
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.cpp
@@ -0,0 +1,114 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaConstantWorkload.hpp"
+#include "GpuFsaWorkloadUtils.hpp"
+
+#include <Half.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <gpuFsa/GpuFsaTensorHandle.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+
+namespace armnn
+{
+
+arm_compute::Status GpuFsaConstantWorkloadValidate(const TensorInfo& output)
+{
+ const arm_compute::TensorInfo neonOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ std::array<arm_compute::DataType,8> supportedTypes = {
+ arm_compute::DataType::F16,
+ arm_compute::DataType::F32,
+ arm_compute::DataType::QASYMM8,
+ arm_compute::DataType::QASYMM8_SIGNED,
+ arm_compute::DataType::QSYMM16,
+ arm_compute::DataType::QSYMM8,
+ arm_compute::DataType::QSYMM8_PER_CHANNEL,
+ arm_compute::DataType::S32
+ };
+ auto it = std::find(begin(supportedTypes), end(supportedTypes), neonOutputInfo.data_type());
+
+ if (it != end(supportedTypes))
+ {
+ return arm_compute::Status{};
+ }
+ else
+ {
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported DataType"};
+ }
+}
+
+GpuFsaConstantWorkload::GpuFsaConstantWorkload(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ const arm_compute::CLCompileContext&)
+ : GpuFsaBaseWorkload<ConstantQueueDescriptor>(descriptor, info)
+ , m_RanOnce(false)
+{
+}
+
+void GpuFsaConstantWorkload::Execute() const
+{
+ // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data
+ // on the first inference, then reused for subsequent inferences.
+ // The initialisation cannot happen at workload construction time since the ACL kernel for the next layer may not
+ // have been configured at the time.
+ if (!m_RanOnce)
+ {
+ const ConstantQueueDescriptor& data = this->m_Data;
+
+ ARMNN_ASSERT(data.m_LayerOutput != nullptr);
+ arm_compute::CLTensor& output = static_cast<GpuFsaTensorHandle*>(data.m_Outputs[0])->GetTensor();
+ arm_compute::DataType computeDataType = static_cast<GpuFsaTensorHandle*>(data.m_Outputs[0])->GetDataType();
+
+ switch (computeDataType)
+ {
+ case arm_compute::DataType::F16:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<Half>());
+ break;
+ }
+ case arm_compute::DataType::F32:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<float>());
+ break;
+ }
+ case arm_compute::DataType::QASYMM8:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<uint8_t>());
+ break;
+ }
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int8_t>());
+ break;
+ }
+ case arm_compute::DataType::QSYMM16:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int16_t>());
+ break;
+ }
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int8_t>());
+ break;
+ }
+ case arm_compute::DataType::S32:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<int32_t>());
+ break;
+ }
+ default:
+ {
+ ARMNN_ASSERT_MSG(false, "Unknown data type");
+ break;
+ }
+ }
+
+ m_RanOnce = true;
+ }
+}
+
+} //namespace armnn
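
The deferred-initialisation idea described in the comment inside Execute() boils down to the following pattern (generic sketch, not part of the patch; the helper name is hypothetical):

// Sketch only: initialise the output on the first Execute() call, then reuse it.
class RunOnceExample
{
public:
    void Execute() const
    {
        if (!m_RanOnce)
        {
            // By the time Execute() runs, downstream kernels have been configured
            // against this output, so it is now safe to fill it with constant data.
            FillOutputWithConstantData();   // hypothetical helper
            m_RanOnce = true;
        }
    }

private:
    void FillOutputWithConstantData() const { /* copy the constant tensor */ }
    mutable bool m_RanOnce = false;
};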
diff --git a/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp
new file mode 100644
index 0000000000..98b383b89f
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaConstantWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "GpuFsaBaseWorkload.hpp"
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
+namespace armnn
+{
+ arm_compute::Status GpuFsaConstantWorkloadValidate(const TensorInfo& output);
+
+ class GpuFsaConstantWorkload : public GpuFsaBaseWorkload<ConstantQueueDescriptor>
+ {
+ public:
+ GpuFsaConstantWorkload(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info,
+ const arm_compute::CLCompileContext& clCompileContext);
+
+ void Execute() const override;
+
+ private:
+ mutable bool m_RanOnce;
+ };
+
+} //namespace armnn
diff --git a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp
new file mode 100644
index 0000000000..20386b5d86
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp
@@ -0,0 +1,106 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaPreCompiledWorkload.hpp"
+#include "GpuFsaWorkloadUtils.hpp"
+#include "armnn/utility/PolymorphicDowncast.hpp"
+
+#include <gpuFsa/GpuFsaTensorHandle.hpp>
+#include <gpuFsa/GpuFsaBackend.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/core/ITensorInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
+#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
+
+namespace armnn {
+
+GpuFsaPreCompiledWorkload::GpuFsaPreCompiledWorkload(const PreCompiledQueueDescriptor &descriptor,
+ const WorkloadInfo &info)
+ : BaseWorkload<PreCompiledQueueDescriptor>(descriptor, info), m_workloadInfo(info)
+{
+ // Check that the workload is holding a pointer to a valid pre-compiled object
+ if (m_Data.m_PreCompiledObject == nullptr)
+ {
+ throw InvalidArgumentException(
+ "GpuFsaPrecompiledWorkload requires a valid pre-compiled object (GpuWorkloadSketch).");
+ }
+}
+
+void GpuFsaPreCompiledWorkload::Execute() const
+{
+/*
+ * The Execute function of the GpuFsa backend's PreCompiled workload needs to perform several steps in order to
+ * create a valid sketch and runtime that can execute the kernel
+ * First we need all of the data stored within the PreCompiled blob which was used to setup the workload, namely:
+ * The GpuWorkloadContext, this is a context which contains the TensorInfos and is unique to the graph being run
+ * The Sketch, this can contain one or many ops and acts as a subgraph within the context
+ * The TensorInfoIds, These are the ids of the TensorInfos used during the creation of the Sketch and stored within
+ * the context.
+ * It is very important that the Tensors passed into the Runtime being used to execute this sketch are created with
+ * the same TensorInfos as used when creating the sketch. We do this by creating new tensors, getting the original
+ * TensorInfos from the GpuWorkloadContext via their ids, and then importing the buffers from our own TensorHandles
+ * directly into these newly created Tensors. This allows us to link the externally visible Tensors from ArmNN to the
+ * Tensors which are needed to execute with the Sketch.
+ *
+ */
+ using namespace arm_compute::experimental::dynamic_fusion;
+ // Get the runtime and configure it with the precompiled sketch
+ ClWorkloadRuntime runtime;
+ GpuFsaPreCompiledBlob *preCompiledBlob = static_cast<GpuFsaPreCompiledBlob*>(m_Data.m_PreCompiledObject);
+ auto workloadContext =
+ &(preCompiledBlob->workloadContext->implementation());
+ auto sketch = preCompiledBlob->sketch.release();
+ std::vector<int32_t> inputIds = *(preCompiledBlob->inputIds.get());
+ std::vector<int32_t> outputIds = *(preCompiledBlob->outputIds.get());
+ auto status = runtime.configure(*sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ arm_compute::CLTensor* tensor = std::get<0>(data);
+ arm_compute::TensorInfo info = std::get<1>(data);
+ arm_compute::experimental::dynamic_fusion::AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ }
+
+ // Create and initialize user tensors
+ std::vector<arm_compute::CLTensor*> inputsWeightsOutputs;
+ inputsWeightsOutputs.reserve(m_Data.m_Inputs.size() + m_Data.m_Outputs.size());
+
+ for (uint32_t inputSlotIdx = 0; inputSlotIdx < m_Data.m_Inputs.size(); ++inputSlotIdx)
+ {
+ arm_compute::CLTensor* input = new arm_compute::CLTensor{};
+ input->allocator()->init(*(dynamic_cast<arm_compute::TensorInfo*>(
+ workloadContext->get_tensor_info(inputIds[inputSlotIdx]))));
+ auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Inputs[inputSlotIdx]);
+ input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer());
+ inputsWeightsOutputs.emplace_back(std::move(input));
+ }
+ // Set the outputs
+ for (uint32_t outputSlotIdx = 0; outputSlotIdx < m_Data.m_Outputs.size(); ++outputSlotIdx)
+ {
+ arm_compute::CLTensor* output = new arm_compute::CLTensor{};
+ output->allocator()->init(*(dynamic_cast<arm_compute::TensorInfo*>(
+ workloadContext->get_tensor_info(outputIds[outputSlotIdx]))));
+ auto* outputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Outputs[outputSlotIdx]);
+ output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer());
+ inputsWeightsOutputs.emplace_back(std::move(output));
+ }
+ runtime.run(inputsWeightsOutputs);
+}
+} // namespace armnn \ No newline at end of file
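
Reduced to its essential steps, the Execute() flow above looks like the following sketch (not part of the patch; ArmNN tensor import and error handling are omitted):

// Sketch only: configure a ClWorkloadRuntime with a fused sketch and run it.
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <tuple>
#include <vector>

void RunSketch(arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch& sketch,
               const std::vector<arm_compute::CLTensor*>& userTensors)
{
    using namespace arm_compute::experimental::dynamic_fusion;

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);                       // compile the fused kernel(s)

    // Allocate any auxiliary tensors the runtime requests.
    for (auto& data : runtime.get_auxiliary_tensors())
    {
        arm_compute::CLTensor* tensor = std::get<0>(data);
        arm_compute::TensorInfo info  = std::get<1>(data);
        AuxMemoryInfo aux             = std::get<2>(data);
        tensor->allocator()->init(info, aux.alignment);
        tensor->allocator()->allocate();
    }

    // The user tensors must have been created/imported against the same
    // TensorInfos that were registered in the GpuWorkloadContext.
    runtime.run(userTensors);
}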
diff --git a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp
new file mode 100644
index 0000000000..d29bf37e69
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.hpp
@@ -0,0 +1,56 @@
+//
+// Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "armnn/backends/Workload.hpp"
+
+#include <arm_compute/core/ITensorInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
+#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace armnn
+{
+
+bool GpuFsaPreCompiledWorkloadValidate(std::string* reasonIfUnsupported);
+
+class GpuFsaPreCompiledWorkload : public BaseWorkload<PreCompiledQueueDescriptor>
+{
+public:
+ GpuFsaPreCompiledWorkload(const PreCompiledQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ bool SupportsTensorHandleReplacement() const override
+ {
+ return true;
+ }
+
+ void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+ {
+ this->m_Data.m_Inputs[slot] = tensorHandle;
+ }
+
+ void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+ {
+ this->m_Data.m_Outputs[slot] = tensorHandle;
+ }
+
+ WorkloadInfo m_workloadInfo;
+};
+
+} //namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp b/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp
new file mode 100644
index 0000000000..10954b07b5
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaWorkloadUtils.hpp
@@ -0,0 +1,163 @@
+//
+// Copyright © 2023-2024 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <BFloat16.hpp>
+#include <Half.hpp>
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+
+#include <armnn/Utils.hpp>
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include <sstream>
+
+
+namespace armnn
+{
+
+ inline std::string GetConvolutionMethodString(arm_compute::ConvolutionMethod& convolutionMethod)
+ {
+ switch (convolutionMethod)
+ {
+ case arm_compute::ConvolutionMethod::FFT:
+ return "FFT";
+ case arm_compute::ConvolutionMethod::DIRECT:
+ return "Direct";
+ case arm_compute::ConvolutionMethod::GEMM:
+ return "GEMM";
+ case arm_compute::ConvolutionMethod::WINOGRAD:
+ return "Winograd";
+ default:
+ return "Unknown";
+ }
+ }
+
+ template <typename T>
+ void CopyArmComputeClTensorData(arm_compute::CLTensor& dstTensor, const T* srcData)
+ {
+ {
+ dstTensor.map(true);
+ }
+
+ {
+ armcomputetensorutils::CopyArmComputeITensorData<T>(srcData, dstTensor);
+ }
+
+ dstTensor.unmap();
+ }
+
+ inline auto SetClStridedSliceData(const std::vector<int>& m_begin,
+ const std::vector<int>& m_end,
+ const std::vector<int>& m_stride)
+ {
+ arm_compute::Coordinates starts;
+ arm_compute::Coordinates ends;
+ arm_compute::Coordinates strides;
+
+ unsigned int num_dims = static_cast<unsigned int>(m_begin.size());
+
+ for (unsigned int i = 0; i < num_dims; i++) {
+ unsigned int revertedIndex = num_dims - i - 1;
+
+ starts.set(i, static_cast<int>(m_begin[revertedIndex]));
+ ends.set(i, static_cast<int>(m_end[revertedIndex]));
+ strides.set(i, static_cast<int>(m_stride[revertedIndex]));
+ }
+
+ return std::make_tuple(starts, ends, strides);
+ }
+
+ inline auto SetClSliceData(const std::vector<unsigned int>& m_begin,
+ const std::vector<unsigned int>& m_size)
+ {
+ // This function must translate the size vector given to an end vector
+ // expected by the ACL NESlice workload
+ arm_compute::Coordinates starts;
+ arm_compute::Coordinates ends;
+
+ unsigned int num_dims = static_cast<unsigned int>(m_begin.size());
+
+ // For strided slices, we have the relationship size = (end - begin) / stride
+ // For slice, we assume stride to be a vector of all ones, yielding the formula
+ // size = (end - begin) therefore we know end = size + begin
+ for (unsigned int i = 0; i < num_dims; i++)
+ {
+ unsigned int revertedIndex = num_dims - i - 1;
+
+ starts.set(i, static_cast<int>(m_begin[revertedIndex]));
+ ends.set(i, static_cast<int>(m_begin[revertedIndex] + m_size[revertedIndex]));
+ }
+
+ return std::make_tuple(starts, ends);
+ }
+
+ inline void InitializeArmComputeClTensorData(arm_compute::CLTensor& clTensor,
+ const ConstTensorHandle* handle)
+ {
+ ARMNN_ASSERT(handle);
+
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(clTensor);
+ switch(handle->GetTensorInfo().GetDataType())
+ {
+ case DataType::Float16:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::Half>());
+ break;
+ case DataType::Float32:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<float>());
+ break;
+ case DataType::QAsymmU8:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<uint8_t>());
+ break;
+ case DataType::QAsymmS8:
+ case DataType::QSymmS8:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int8_t>());
+ break;
+ case DataType::QSymmS16:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int16_t>());
+ break;
+ case DataType::Signed32:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int32_t>());
+ break;
+ case DataType::BFloat16:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::BFloat16>());
+ break;
+ default:
+ // Throw exception; assertion not called in release build.
+ throw Exception("Unexpected tensor type during InitializeArmComputeClTensorData().");
+ }
+ };
+
+ inline RuntimeException WrapClError(const cl::Error& clError, const CheckLocation& location)
+ {
+ std::stringstream message;
+ message << "CL error: " << clError.what() << ". Error code: " << clError.err();
+
+ return RuntimeException(message.str(), location);
+ }
+
+ inline void RunClFunction(arm_compute::IFunction& function, const CheckLocation& location)
+ {
+ try
+ {
+ function.run();
+ }
+ catch (cl::Error& error)
+ {
+ throw WrapClError(error, location);
+ }
+ }
+
+ template <typename DataType, typename PayloadType>
+ DataType* GetOutputTensorData(unsigned int idx, const PayloadType& data)
+ {
+ ITensorHandle* tensorHandle = data.m_Outputs[idx];
+ return reinterpret_cast<DataType*>(tensorHandle->Map());
+ }
+
+} //namespace armnn
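
Finally, a small usage sketch (not part of the patch) of the slice helper above, illustrating the size-to-end translation and the reversed dimension order described in its comments:

// Sketch only: SetClSliceData turns ArmNN begin/size vectors into the
// begin/end Coordinates (in reversed dimension order) expected by ACL.
#include "GpuFsaWorkloadUtils.hpp"   // internal backend header
#include <tuple>
#include <vector>

void SliceHelperExample()
{
    std::vector<unsigned int> begin = {0, 1, 2, 0};
    std::vector<unsigned int> size  = {1, 4, 4, 3};

    arm_compute::Coordinates starts;
    arm_compute::Coordinates ends;
    std::tie(starts, ends) = armnn::SetClSliceData(begin, size);

    // ACL dimension i corresponds to ArmNN dimension (numDims - i - 1),
    // and end = begin + size for each dimension.
}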