From 8a570466aca7ae1619fe8fa715b68419fceb142f Mon Sep 17 00:00:00 2001
From: David Monahan
Date: Wed, 22 Nov 2023 13:24:25 +0000
Subject: IVGCVSW-8157 - Rebase existing GpuFsa patches to 23.11

Squashed commit of the following:

IVGCVSW-7159 Add GpuFsa backend skeleton
IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL
IVGCVSW-7381 Add IsLayerSupported implementation to GpuFsa backend
IVGCVSW-7382 Implementation of Conv2d within GpuFsa

Signed-off-by: James Conroy
Signed-off-by: Matthew Sloyan
Signed-off-by: David Monahan
Change-Id: Id23d9ee598535de7b38a99ca223cdf0ad2102cef
---
 src/backends/gpuFsa/CMakeLists.txt | 44 +++
 src/backends/gpuFsa/GpuFsaBackend.cpp | 310 ++++++++++++++++++
 src/backends/gpuFsa/GpuFsaBackend.hpp | 285 ++++++++++++++++
 src/backends/gpuFsa/GpuFsaBackendContext.cpp | 233 +++++++++++++
 src/backends/gpuFsa/GpuFsaBackendContext.hpp | 47 +++
 .../gpuFsa/GpuFsaBackendDefaultAllocator.hpp | 51 +++
 src/backends/gpuFsa/GpuFsaBackendId.hpp | 12 +
 src/backends/gpuFsa/GpuFsaContextControl.cpp | 169 ++++++++++
 src/backends/gpuFsa/GpuFsaContextControl.hpp | 42 +++
 src/backends/gpuFsa/GpuFsaLayerSupport.cpp | 111 +++++++
 src/backends/gpuFsa/GpuFsaLayerSupport.hpp | 24 ++
 src/backends/gpuFsa/GpuFsaMemoryManager.cpp | 120 +++++++
 src/backends/gpuFsa/GpuFsaMemoryManager.hpp | 59 ++++
 src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp | 21 ++
 src/backends/gpuFsa/GpuFsaTensorHandle.cpp | 188 +++++++++
 src/backends/gpuFsa/GpuFsaTensorHandle.hpp | 361 +++++++++++++++++++++
 src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp | 112 +++++++
 src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp | 55 ++++
 src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp | 91 ++++++
 src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp | 59 ++++
 src/backends/gpuFsa/backend.cmake | 15 +
 src/backends/gpuFsa/backend.mk | 58 ++++
 src/backends/gpuFsa/layerValidators/CMakeLists.txt | 14 +
 .../GpuFsaConvolution2dValidate.cpp | 126 +++++++
 .../GpuFsaConvolution2dValidate.hpp | 28 ++
 src/backends/gpuFsa/test/CMakeLists.txt | 19 ++
 .../gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp | 193 +++++++++++
 src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp | 8 +
 .../gpuFsa/test/GpuFsaLayerSupportTests.cpp | 64 ++++
 src/backends/gpuFsa/test/GpuFsaLayerTests.cpp | 12 +
 .../gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp | 137 ++++++++
 .../gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp | 45 +++
 src/backends/gpuFsa/workloads/CMakeLists.txt | 16 +
 .../gpuFsa/workloads/GpuFsaBaseWorkload.hpp | 39 +++
 34 files changed, 3168 insertions(+)
 create mode 100644 src/backends/gpuFsa/CMakeLists.txt
 create mode 100644 src/backends/gpuFsa/GpuFsaBackend.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaBackend.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaBackendId.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaLayerSupport.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaLayerSupport.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.hpp
 create mode 100644 src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaTensorHandle.cpp
 create mode 100644 src/backends/gpuFsa/GpuFsaTensorHandle.hpp
 create mode 100644
src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp create mode 100644 src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp create mode 100644 src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp create mode 100644 src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp create mode 100644 src/backends/gpuFsa/backend.cmake create mode 100644 src/backends/gpuFsa/backend.mk create mode 100644 src/backends/gpuFsa/layerValidators/CMakeLists.txt create mode 100644 src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp create mode 100644 src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp create mode 100644 src/backends/gpuFsa/test/CMakeLists.txt create mode 100644 src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaLayerTests.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp create mode 100644 src/backends/gpuFsa/workloads/CMakeLists.txt create mode 100644 src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp (limited to 'src/backends/gpuFsa') diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt new file mode 100644 index 0000000000..8d1a58ee27 --- /dev/null +++ b/src/backends/gpuFsa/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT +# + +if(ARMCOMPUTEGPUFSA) + list(APPEND armnnGpuFsaBackend_sources + GpuFsaBackend.cpp + GpuFsaBackend.hpp + GpuFsaBackendContext.cpp + GpuFsaBackendContext.hpp + GpuFsaBackendDefaultAllocator.hpp + GpuFsaBackendId.hpp + GpuFsaContextControl.cpp + GpuFsaContextControl.hpp + GpuFsaLayerSupport.cpp + GpuFsaLayerSupport.hpp + GpuFsaRegistryInitializer.cpp + GpuFsaTensorHandle.hpp + GpuFsaTensorHandleFactory.cpp + GpuFsaTensorHandleFactory.hpp + GpuFsaWorkloadFactory.cpp + GpuFsaWorkloadFactory.hpp + ) + + add_subdirectory(layerValidators) + add_subdirectory(workloads) + + if(BUILD_UNIT_TESTS) + add_subdirectory(test) + endif() + +else() + list(APPEND armnnGpuFsaBackend_sources + GpuFsaBackendId.hpp + GpuFsaLayerSupport.cpp + GpuFsaLayerSupport.hpp + ) +endif() + +add_library(armnnGpuFsaBackend OBJECT ${armnnGpuFsaBackend_sources}) +target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn) +target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils) +target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/backends) \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp new file mode 100644 index 0000000000..8ea9e8e7d3 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackend.cpp @@ -0,0 +1,310 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
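Editorial sketch: GpuFsaBackend.cpp (reproduced over the next several hunks) hands the fused-kernel sketch to the runtime as a type-erased blob owned by a unique_ptr with a function-pointer deleter (DeleteAsType), created in OptimizeSubgraphView. The following is a self-contained illustration of that ownership pattern only; FakeWorkloadSketch and the local PreCompiledObjectPtr alias are stand-ins, not names from this patch or from ACL.

// Stand-alone illustration of the type-erased blob ownership used by
// GpuFsaBackend::OptimizeSubgraphView below. All names here are local stand-ins.
#include <iostream>
#include <memory>

struct FakeWorkloadSketch                  // stand-in for the fused workload sketch
{
    ~FakeWorkloadSketch() { std::cout << "sketch destroyed through typed deleter\n"; }
};

template <typename T>
void DeleteAsType(const void* const blob)  // same shape as the helper in this patch
{
    delete static_cast<const T*>(blob);
}

using PreCompiledObjectDeleter = void (*)(const void*);
using PreCompiledObjectPtr     = std::unique_ptr<void, PreCompiledObjectDeleter>;

int main()
{
    auto sketch = std::make_unique<FakeWorkloadSketch>();
    // Ownership moves into a void* blob; the deleter restores the type on destruction.
    PreCompiledObjectPtr blob(sketch.release(), DeleteAsType<FakeWorkloadSketch>);
    return 0;  // blob goes out of scope and deletes the sketch with the correct type
}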
+// SPDX-License-Identifier: MIT +// + +#include "GpuFsaBackend.hpp" +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaBackendDefaultAllocator.hpp" +#include "GpuFsaBackendId.hpp" +#include "GpuFsaLayerSupport.hpp" +#include "GpuFsaTensorHandleFactory.hpp" +#include "GpuFsaWorkloadFactory.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "layerValidators/GpuFsaConvolution2dValidate.hpp" + +namespace armnn +{ + +template +inline void DeleteAsType(const void* const blob) +{ + delete static_cast(blob); +} + +inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer) +{ + SubgraphView::InputSlots result; + for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it) + { + result.push_back(&(*it)); + } + return result; +} + +inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer) +{ + SubgraphView::OutputSlots result; + for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it) + { + result.push_back(&(*it)); + } + return result; +} + +inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs, + SubgraphView::OutputSlots&& outputs, + SubgraphView::Layers&& layers) +{ + return std::make_unique(std::move(inputs), std::move(outputs), std::move(layers)); +} + +const BackendId& GpuFsaBackend::GetIdStatic() +{ + static const BackendId s_Id{GpuFsaBackendId()}; + return s_Id; +} + +IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const +{ + if (m_UsingCustomAllocator) + { + return std::make_unique(m_CustomAllocator); + } + return std::make_unique(std::make_unique()); +} + +IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( + const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const +{ + return std::make_unique(PolymorphicPointerDowncast(memoryManager)); +} + +IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( + TensorHandleFactoryRegistry& registry) const +{ + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + + return std::make_unique(PolymorphicPointerDowncast(memoryManager)); +} + +IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( + TensorHandleFactoryRegistry& registry, + const ModelOptions&, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const +{ + + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + + return std::make_unique(PolymorphicPointerDowncast(memoryManager)); +} + +std::vector GpuFsaBackend::GetHandleFactoryPreferences() const +{ + return std::vector { 
GpuFsaTensorHandleFactory::GetIdStatic() }; +} + +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) +{ + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + +} + +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) +{ + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); +} + +IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const +{ + return IBackendContextPtr{new GpuFsaBackendContext{options}}; +} + +IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr&) +{ + return IBackendProfilingContextPtr{}; +} + +IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const +{ + static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport}; + return layerSupport; +} + +std::unique_ptr GpuFsaBackend::GetDefaultAllocator() const +{ + return std::make_unique(); +} + +OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const +{ + OptimizationViews optimizationViews(modelOptions); + + using namespace arm_compute::experimental::dynamic_fusion; + // Create a new workload sketch, for validation purposes + auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context(); + auto gpuCtx = GpuWorkloadContext(&compileCtx); + + auto it = subgraph.end(); + std::map untouched; + while (it != subgraph.begin()) + { + --it; + Layer& base = *(PolymorphicDowncast(*it)); + untouched.insert({base.GetGuid(), &base}); + } + + GpuFsaLayerSupport supportChecker; + it = subgraph.end(); + while (it != subgraph.begin()) + { + --it; + Layer& base = *(PolymorphicDowncast(*it)); + + std::unique_ptr sketch = std::make_unique(&gpuCtx); + switch (base.GetType()) + { + case (LayerType::Convolution2d): + { + auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(); + //std::vector infos = {input, weights}; + + auto desc = PolymorphicDowncast(&base.GetParameters()); + if (desc->m_BiasEnabled) + { + auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo(); + GpuFsaConvolution2dCreateOp(input, + *desc, + weights, + bias); + } + else + { + GpuFsaConvolution2dCreateOp(input, + *desc, + weights, + EmptyOptional()); + } + break; + } + default: + // unsupported layer for GpuFsa backend + continue; + } + + auto 
compiledBlob = std::make_unique(sketch.release(), DeleteAsType); + + IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer( + PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()), + std::move(*compiledBlob), + armnn::Optional(GetId()), + "GpuFsa_Pre_Compiled_Layer"); + + // Copy the output tensor infos from sub-graph + for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++) + { + preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo()); + } + + SubgraphView::SubgraphViewPtr substituteSubgraph = + CreateSubgraphViewFrom(CreateInputsFrom(&base), + CreateOutputsFrom(&base), + {&base}); + + optimizationViews.AddSubstitution({ *substituteSubgraph, SubgraphView(preCompiledLayer) }); + + untouched.erase(base.GetGuid()); + } + + if (optimizationViews.GetSubstitutions().empty()) + { + optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + } + else + { + ReportUntouchedLayers(optimizationViews, untouched); + } + + + return optimizationViews; +} + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp new file mode 100644 index 0000000000..26960065c7 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackend.hpp @@ -0,0 +1,285 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +#include +#include +#include +#include + +// System includes for mapping and unmapping memory +#include + +namespace armnn +{ + +// add new capabilities here.. +const BackendCapabilities gpuFsaCapabilities("GpuFsa", + { + {"NonConstWeights", false}, + {"AsyncExecution", false}, + {"ProtectedContentAllocation", false}, + {"ConstantTensorsAsInputs", false}, + {"PreImportIOTensors", false}, + {"ExternallyManagedMemory", false}, + {"MultiAxisPacking", false}, + {"SingleAxisPacking", false} + }); + +class GpuFsaBackend : public IBackendInternal +{ +public: + GpuFsaBackend() : m_CustomAllocator(nullptr) {}; + GpuFsaBackend(std::shared_ptr allocator) + { + UseCustomMemoryAllocator(allocator, armnn::EmptyOptional()); + } + ~GpuFsaBackend() = default; + + static const BackendId& GetIdStatic(); + const BackendId& GetId() const override { return GetIdStatic(); } + + IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override; + + IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( + const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override; + + IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override; + + IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const override; + + std::vector GetHandleFactoryPreferences() const override; + + void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override; + + void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) override; + + IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; + IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override; + + IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const 
override; + + OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const override; + + std::unique_ptr GetDefaultAllocator() const override; + + BackendCapabilities GetCapabilities() const override + { + return gpuFsaCapabilities; + }; + + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional) override + { + ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend"; + + // Set flag to signal the backend to use a custom memory allocator + m_CustomAllocator = std::make_shared(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; + } + + // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this + class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr alloc) : m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique(cl::Buffer(buffer), + hostMemPtr, + m_CustomAllocator->GetMemorySourceType()); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + else if (source == MemorySource::DmaBuf) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + else if (source == MemorySource::DmaBufProtected) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + 
CL_IMPORT_TYPE_PROTECTED_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr m_CustomAllocator; + std::map m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source) + : ICLMemoryRegion(buffer.getInfo()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + m_MemorySource = source; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + if (_mapping != nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped"); + } + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = m_HostMemPtr; + return _mapping; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd + _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast(m_HostMemPtr)), 0); + return _mapping; + break; + default: + throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source"); + break; + } + } + + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = nullptr; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + munmap(_mapping, _size); + _mapping = nullptr; + break; + default: + throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source"); + break; + } + } + private: + void* m_HostMemPtr = nullptr; + armnn::MemorySource m_MemorySource; + }; + + std::shared_ptr m_CustomAllocator; + bool m_UsingCustomAllocator = false; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.cpp b/src/backends/gpuFsa/GpuFsaBackendContext.cpp new file mode 100644 index 0000000000..84b948303a --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.cpp @@ -0,0 +1,233 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
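Editorial sketch: ClBackendCustomAllocatorMemoryRegion::map (above) services DmaBuf imports by treating the stored host pointer as a file descriptor and calling mmap; unmap reverses it with munmap. Below is a minimal POSIX-only sketch of that mapping step, independent of OpenCL; the file descriptor is assumed to come from a dma-buf exporter and is hypothetical here.

// Minimal sketch of the mmap/munmap step used for MemorySource::DmaBuf above.
// The dma-buf file descriptor is a hypothetical input.
#include <sys/mman.h>
#include <cstddef>
#include <stdexcept>

void* MapDmaBufForWrite(int dmaBufFd, size_t size)
{
    void* mapping = mmap(nullptr, size, PROT_WRITE, MAP_SHARED, dmaBufFd, 0);
    if (mapping == MAP_FAILED)
    {
        throw std::runtime_error("mmap of dma-buf fd failed");
    }
    return mapping;
}

void UnmapDmaBuf(void* mapping, size_t size)
{
    munmap(mapping, size);  // drop the CPU mapping; the dma-buf itself stays alive
}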
+// SPDX-License-Identifier: MIT +// + +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaContextControl.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace armnn +{ + +struct GpuFsaBackendContext::GpuFsaContextControlWrapper +{ + GpuFsaContextControlWrapper(arm_compute::CLTuner* tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_GpuFsaContextControl(tuner, heuristicsHandle, profilingEnabled) + {} + + bool Sync() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // Waits for all queued CL requests to finish before unloading the network they may be using. + try + { + // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error. + arm_compute::CLScheduler::get().sync(); + } + catch (const cl::Error& err) + { + ARMNN_LOG(warning) << "Runtime::UnloadNetwork(): an error occurred while waiting for " + "the queued CL requests to finish"; + throw err; + } + } + + return true; + } + + void ClearClCache() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // There are no loaded networks left, so clear the CL cache to free up memory + m_GpuFsaContextControl.ClearClCache(); + } + } + + GpuFsaContextControl m_GpuFsaContextControl; +}; + +GpuFsaBackendContext::GpuFsaBackendContext(const IRuntime::CreationOptions& options) + : IBackendContext(options) + , m_TuningFile() +{ + bool kernelProfiling = options.m_EnableGpuProfiling; + + arm_compute::CLTuner* tuner = nullptr; + arm_compute::CLGEMMHeuristicsHandle* mlgoTuner = nullptr; + bool useLegacyTunerAPI = options.m_GpuAccTunedParameters.get() != nullptr; + if (useLegacyTunerAPI) + { + auto clTunerParams = PolymorphicDowncast( + options.m_GpuAccTunedParameters.get()); + tuner = &clTunerParams->m_Tuner; + + if (tuner) + { + auto ConvertTuningLevel = [](IGpuAccTunedParameters::TuningLevel level, + armnn::IGpuAccTunedParameters::Mode mode) + { + if (mode == armnn::IGpuAccTunedParameters::Mode::UseTunedParameters) + { + return TuningLevel::None; + } + + switch(level) + { + case IGpuAccTunedParameters::TuningLevel::Rapid: + return TuningLevel::Rapid; + case IGpuAccTunedParameters::TuningLevel::Normal: + return TuningLevel::Normal; + case IGpuAccTunedParameters::TuningLevel::Exhaustive: + return TuningLevel::Exhaustive; + default: + { + ARMNN_LOG(warning) << "Tuning level not recognised."; + return TuningLevel::None; + } + } + }; + + TuningLevel tuningLevel = ConvertTuningLevel(clTunerParams->m_TuningLevel, clTunerParams->m_Mode); + ConfigureTuner(*tuner, tuningLevel); + } + } + else //New backend options API + { + const TuningLevel defaultTuningLevel = TuningLevel::None; + auto tuningLevel = defaultTuningLevel; + + ParseOptions(options.m_BackendOptions, "GpuFsa", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "KernelProfilingEnabled") + { + kernelProfiling |= ParseBooleanBackendOption(value, false); + } else if (name == "TuningFile") + { + m_TuningFile = ParseStringBackendOption(value, ""); + } else if (name == "TuningLevel") + { + tuningLevel = ParseTuningLevel(value, defaultTuningLevel); + } + else if (name == "MLGOTuningFilePath") + { + m_MLGOTuningFile = ParseStringBackendOption(value, ""); + } + }); + + // Create the tuner, in tuning mode initially. 
+ m_Tuner = std::make_unique(true); + + ConfigureTuner(*(m_Tuner.get()), tuningLevel); + + if (!m_TuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu tuning data from file: " << m_TuningFile; + m_Tuner->load_from_file(m_TuningFile.c_str()); + } + catch (const std::exception& e) + { + // Warn if not tuning, otherwise tuning will generate new params + if (tuningLevel == TuningLevel::None) + { + ARMNN_LOG(warning) << "Could not load GpuFsa tuner data file."; + } + } + } + + if (!m_MLGOTuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu MLGO tuning data from file: " << m_TuningFile; + if(m_MLGOTuner.reload_from_file(m_MLGOTuningFile.c_str())) + { + mlgoTuner = &m_MLGOTuner; + } + } + catch (const std::exception& e) + { + ARMNN_LOG(warning) << "Could not load GpuFsa MLGO tuner data file."; + } + } + + tuner = m_Tuner.get(); + } + + m_GpuFsaContextControlWrapper = std::make_unique( + tuner, + mlgoTuner, + kernelProfiling + ); +} + +bool GpuFsaBackendContext::BeforeLoadNetwork(NetworkId) +{ + return true; +} + +bool GpuFsaBackendContext::AfterLoadNetwork(NetworkId networkId) +{ + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.insert(networkId); + } + return true; +} + +bool GpuFsaBackendContext::BeforeUnloadNetwork(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +bool GpuFsaBackendContext::AfterUnloadNetwork(NetworkId networkId) +{ + bool clearCache = false; + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.erase(networkId); + clearCache = m_NetworkIds.empty(); + } + + if (clearCache) + { + m_GpuFsaContextControlWrapper->ClearClCache(); + } + + return true; +} + +bool GpuFsaBackendContext::AfterEnqueueWorkload(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +GpuFsaBackendContext::~GpuFsaBackendContext() +{ + if (m_Tuner && !m_TuningFile.empty()) + { + try + { + m_Tuner->save_to_file(m_TuningFile.c_str()); + } + catch(const std::exception& e) + { + ARMNN_LOG(warning) << "Could not save GpuFsa tuner data to file " << m_TuningFile; + } + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.hpp b/src/backends/gpuFsa/GpuFsaBackendContext.hpp new file mode 100644 index 0000000000..271688fd99 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.hpp @@ -0,0 +1,47 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
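Editorial sketch: GpuFsaBackendContext reads its tuning configuration from the runtime's backend options, as parsed above ("KernelProfilingEnabled", "TuningFile", "TuningLevel", "MLGOTuningFilePath"). A hedged sketch of how a caller might supply those options, assuming the standard armnn::BackendOptions API and IRuntime::CreationOptions; the file paths and the TuningLevel mapping are assumptions, not taken from this patch.

// Hedged sketch: supplying the options parsed by GpuFsaBackendContext above.
// Assumes the standard armnn::BackendOptions / IRuntime::CreationOptions API.
#include <armnn/BackendOptions.hpp>
#include <armnn/IRuntime.hpp>

armnn::IRuntimePtr CreateTunedGpuFsaRuntime()
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    options.m_EnableGpuProfiling = false;

    // Option names match the strings handled in the ParseOptions call above.
    options.m_BackendOptions.emplace_back(
        BackendOptions{"GpuFsa",
                       {
                           {"KernelProfilingEnabled", true},
                           {"TuningLevel", 1},                        // assumed to map to Rapid
                           {"TuningFile", "gpufsa_tuning.csv"},       // hypothetical path
                           {"MLGOTuningFilePath", "gpufsa_mlgo.bin"}  // hypothetical path
                       }});

    return IRuntime::Create(options);
}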
+// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include +#include + +#include +#include + +namespace armnn +{ + +class GpuFsaBackendContext : public IBackendContext +{ +public: + GpuFsaBackendContext(const IRuntime::CreationOptions& options); + + bool BeforeLoadNetwork(NetworkId networkId) override; + bool AfterLoadNetwork(NetworkId networkId) override; + + bool BeforeUnloadNetwork(NetworkId networkId) override; + bool AfterUnloadNetwork(NetworkId networkId) override; + + bool AfterEnqueueWorkload(NetworkId networkId) override; + + ~GpuFsaBackendContext() override; + +private: + std::mutex m_Mutex; + struct GpuFsaContextControlWrapper; + std::unique_ptr m_GpuFsaContextControlWrapper; + + std::unordered_set m_NetworkIds; + + std::unique_ptr m_Tuner; + std::string m_TuningFile; + +protected: + arm_compute::CLGEMMHeuristicsHandle m_MLGOTuner; + std::string m_MLGOTuningFile; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp new file mode 100644 index 0000000000..c57ff63b92 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp @@ -0,0 +1,51 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include +#include + +namespace armnn +{ + +/** +* Default Memory Allocator class returned from IBackendInternal::GetDefaultAllocator(MemorySource) +*/ +class GpuFsaBackendDefaultAllocator : public ICustomAllocator +{ +public: + GpuFsaBackendDefaultAllocator() = default; + + void* allocate(size_t size, size_t alignment = 0) override + { + IgnoreUnused(alignment); + cl_mem buf{ clCreateBuffer(arm_compute::CLScheduler::get().context().get(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + size, + nullptr, + nullptr)}; + return static_cast(buf); + } + + void free(void* ptr) override + { + ARM_COMPUTE_ERROR_ON(ptr == nullptr); + clReleaseMemObject(static_cast(ptr)); + } + + MemorySource GetMemorySourceType() override + { + return MemorySource::Gralloc; + } + + void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override + { + IgnoreUnused(alignment); + return static_cast(buffer) + offset; + } +}; +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendId.hpp b/src/backends/gpuFsa/GpuFsaBackendId.hpp new file mode 100644 index 0000000000..1231798bf0 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendId.hpp @@ -0,0 +1,12 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +namespace armnn +{ + +constexpr const char * GpuFsaBackendId() { return "GpuFsa"; } + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaContextControl.cpp b/src/backends/gpuFsa/GpuFsaContextControl.cpp new file mode 100644 index 0000000000..cc53356c0d --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.cpp @@ -0,0 +1,169 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
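Editorial sketch: GpuFsaBackendDefaultAllocator above implements ICustomAllocator in terms of CL buffers. For comparison, a minimal host-memory allocator that could be handed to the backend through GpuFsaBackend::UseCustomMemoryAllocator (or, assumed, the runtime's custom-allocator map) is sketched below; it is illustrative only and reports MemorySource::Malloc so the import path in GpuFsaBackendCustomAllocatorWrapper would take the host-pointer branch.

// Illustrative ICustomAllocator returning plain host memory (MemorySource::Malloc).
// Not part of this patch; the 64-byte fallback alignment is an assumption.
#include <armnn/backends/ICustomAllocator.hpp>
#include <cstdlib>

class HostMallocAllocator : public armnn::ICustomAllocator
{
public:
    void* allocate(size_t size, size_t alignment) override
    {
        size_t align = (alignment == 0) ? 64 : alignment;          // assumed fallback
        size_t rounded = ((size + align - 1) / align) * align;     // aligned_alloc needs a multiple
        return std::aligned_alloc(align, rounded);
    }

    void free(void* ptr) override
    {
        std::free(ptr);
    }

    armnn::MemorySource GetMemorySourceType() override
    {
        return armnn::MemorySource::Malloc;
    }

    void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override
    {
        (void)alignment;
        return static_cast<char*>(buffer) + offset;
    }
};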
+// SPDX-License-Identifier: MIT +// + +#include "GpuFsaContextControl.hpp" + +#include +#include +#include + +#include +#include + +#include + +namespace cl +{ +class Context; +class CommandQueue; +class Device; +} + +namespace armnn +{ + +GpuFsaContextControl::GpuFsaContextControl(arm_compute::CLTuner *tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_Tuner(tuner) + , m_HeuristicsHandle(heuristicsHandle) + , m_ProfilingEnabled(profilingEnabled) +{ + try + { + std::vector platforms; + cl::Platform::get(&platforms); + + // Selects default platform for the first element. + cl::Platform::setDefault(platforms[0]); + + std::vector devices; + platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); + + // Selects default device for the first element. + cl::Device::setDefault(devices[0]); + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}", + clError.what(), clError.err())); + } + + // Removes the use of global CL context. + cl::Context::setDefault(cl::Context{}); + if (cl::Context::getDefault()() != NULL) + { + throw armnn::Exception("GpuFsaContextControl: Unable to remove the global CL context"); + } + + // Removes the use of global CL command queue. + cl::CommandQueue::setDefault(cl::CommandQueue{}); + if (cl::CommandQueue::getDefault()() != NULL) + { + throw armnn::Exception("GpuFsaContextControl: Unable to remove the global CL command queue"); + } + + // Always load the OpenCL runtime. + LoadOpenClRuntime(); +} + +GpuFsaContextControl::~GpuFsaContextControl() +{ + // Load the OpencCL runtime without the tuned parameters to free the memory for them. + try + { + UnloadOpenClRuntime(); + } + catch (const cl::Error& clError) + { + // This should not happen, it is ignored if it does. + + // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an + // exception of type std::length_error. + // Using stderr instead in this context as there is no point in nesting try-catch blocks here. + std::cerr << "A CL error occurred unloading the runtime tuner parameters: " + << clError.what() << ". CL error code is: " << clError.err() << std::endl; + } +} + +void GpuFsaContextControl::LoadOpenClRuntime() +{ + DoLoadOpenClRuntime(true); +} + +void GpuFsaContextControl::UnloadOpenClRuntime() +{ + DoLoadOpenClRuntime(false); +} + +void GpuFsaContextControl::DoLoadOpenClRuntime(bool updateTunedParameters) +{ + cl::Device device = cl::Device::getDefault(); + cl::Context context; + cl::CommandQueue commandQueue; + + if (arm_compute::CLScheduler::get().is_initialised() && arm_compute::CLScheduler::get().context()() != NULL) + { + // Wait for all queued CL requests to finish before reinitialising it. + arm_compute::CLScheduler::get().sync(); + } + + try + { + arm_compute::CLKernelLibrary::get().clear_programs_cache(); + // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no + // context references); it is initialised again, with a proper context, later. + arm_compute::CLScheduler::get().init(context, commandQueue, device); + arm_compute::CLKernelLibrary::get().init(".", context, device); + + { + // + // Here we replace the context with a new one in which + // the memory leak checks show it as an extra allocation but + // because of the scope of the leak checks, it doesn't count + // the disposal of the original object. 
On the other hand it + // does count the creation of this context which it flags + // as a memory leak. By adding the following line we prevent + // this to happen. + // + ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE(); + context = cl::Context(device); + } + + // NOTE: In this specific case profiling has to be enabled on the command queue + // in order for the CLTuner to work. + bool profilingNeededForClTuner = updateTunedParameters && m_Tuner && + m_Tuner->tune_new_kernels(); + + if (m_ProfilingEnabled || profilingNeededForClTuner) + { + // Create a new queue with profiling enabled. + commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); + } + else + { + // Use default queue. + commandQueue = cl::CommandQueue(context, device); + } + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}", + clError.what(), clError.err())); + } + + // Note the first argument (path to cl source code) will be ignored as they should be embedded in the armcompute. + arm_compute::CLKernelLibrary::get().init(".", context, device); + arm_compute::CLScheduler::get().init(context, commandQueue, device, m_Tuner, m_HeuristicsHandle); +} + +void GpuFsaContextControl::ClearClCache() +{ + DoLoadOpenClRuntime(true); +} + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaContextControl.hpp b/src/backends/gpuFsa/GpuFsaContextControl.hpp new file mode 100644 index 0000000000..f77b1fbdd4 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.hpp @@ -0,0 +1,42 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +namespace armnn +{ + +// ARM Compute OpenCL context control. +class GpuFsaContextControl +{ +public: + + GpuFsaContextControl(arm_compute::CLTuner* = nullptr, + arm_compute::CLGEMMHeuristicsHandle* = nullptr, + bool profilingEnabled = false); + + virtual ~GpuFsaContextControl(); + + void LoadOpenClRuntime(); + + // Users should call this (after freeing all of the cl::Context objects they use) + // to release the cached memory used by the compute library. + void UnloadOpenClRuntime(); + + // Clear the CL cache, without losing the tuned parameter settings. + void ClearClCache(); + +private: + + void DoLoadOpenClRuntime(bool updateTunedParameters); + + arm_compute::CLTuner* m_Tuner; + arm_compute::CLGEMMHeuristicsHandle* m_HeuristicsHandle; + + bool m_ProfilingEnabled; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp new file mode 100644 index 0000000000..063af2732e --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp @@ -0,0 +1,111 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaLayerSupport.hpp" + +#include +#include +#include + +#if defined(ARMCOMPUTEGPUFSA_ENABLED) +#include "layerValidators/GpuFsaConvolution2dValidate.hpp" +#endif + +#include + +namespace armnn +{ + +template +bool IsGpuFsaBackendSupported(Optional reasonIfUnsupported, Args... 
args) +{ + IgnoreUnused(reasonIfUnsupported, (args)...); +#if defined(ARMCOMPUTEGPUFSA_ENABLED) + return true; +#else + if (reasonIfUnsupported) + { + reasonIfUnsupported.value() = "The armnn library has been built without CL support"; + } + return false; +#endif +} + +#if defined(ARMCOMPUTEGPUFSA_ENABLED) +#define FORWARD_GPUFSA_LAYER_SUPPORT_FUNC(expr) (expr) +#else +#define FORWARD_GPUFSA_LAYER_SUPPORT_FUNC(expr) IsGpuFsaBackendSupported(reasonIfUnsupported) +#endif + +#if defined(ARMCOMPUTEGPUFSA_ENABLED) +template +inline bool CheckIsLayerSupported(FuncType&& func, Optional reasonIfUnsupported, Args&&... args) +{ + arm_compute::Status aclStatus = func(std::forward(args)...); + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + reasonIfUnsupported.value() = aclStatus.error_description(); + } + return supported; +} + +#define FORWARD_LAYER_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \ + return CheckIsLayerSupported(func, reasonIfUnsupported, __VA_ARGS__); +#else +#define FORWARD_LAYER_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \ + return IsGpuFsaBackendSupported(reasonIfUnsupported, __VA_ARGS__); +#endif + +bool GpuFsaLayerSupport::IsLayerSupported(const LayerType& type, + const std::vector& infos, + const BaseDescriptor& descriptor, + const Optional& lstmParamsInfo, + const Optional& quantizedLstmInputParamsInfo, + Optional reasonIfUnsupported) const +{ + IgnoreUnused(lstmParamsInfo); + IgnoreUnused(quantizedLstmInputParamsInfo); + + switch (type) { + case LayerType::Convolution2d: + { + if (infos.size() != 4) + { + throw InvalidArgumentException("Invalid number of Convolution2d TensorInfos. " + "TensorInfos should be of format: {input, output, weights, biases}."); + } + + auto desc = *(PolymorphicDowncast(&descriptor)); + if (infos[3] == TensorInfo()) + { + FORWARD_LAYER_VALIDATE_FUNC(GpuFsaConvolution2dValidate, + reasonIfUnsupported, + infos[0], + desc, + infos[2], + EmptyOptional()); + } + else + { + FORWARD_LAYER_VALIDATE_FUNC(GpuFsaConvolution2dValidate, + reasonIfUnsupported, + infos[0], + desc, + infos[2], + infos[3]); + } + } + case LayerType::Constant: + case LayerType::Input: + case LayerType::Output: + return IsGpuFsaBackendSupported(reasonIfUnsupported, infos[0]); + default: + // Layers not supported in the GpuFsa backend. + return false; + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.hpp b/src/backends/gpuFsa/GpuFsaLayerSupport.hpp new file mode 100644 index 0000000000..31177ec3c9 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaLayerSupport.hpp @@ -0,0 +1,24 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +namespace armnn +{ + +class GpuFsaLayerSupport : public ILayerSupport +{ +public: + bool IsLayerSupported(const LayerType& type, + const std::vector& infos, + const BaseDescriptor& descriptor, + const Optional& lstmParamsInfo, + const Optional&, + Optional reasonIfUnsupported) const override; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp new file mode 100644 index 0000000000..e16c02d18e --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp @@ -0,0 +1,120 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
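Editorial sketch: GpuFsaLayerSupport above answers support queries by forwarding to the per-layer validators and converting the arm_compute::Status into a bool plus reason string. A hedged sketch of querying it from application code, assuming armnn's BackendHelper API (GetILayerSupportByBackendId and the IsConvolution2dSupported signature are assumptions about the public API, not part of this patch); the NHWC shapes are chosen purely for illustration.

// Hedged sketch: asking the GpuFsa backend whether a Conv2d configuration is supported.
#include <armnn/BackendHelper.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Optional.hpp>
#include <armnn/Tensor.hpp>
#include <iostream>

bool IsConv2dSupportedOnGpuFsa()
{
    using namespace armnn;

    TensorInfo input  ({1, 5, 5, 1}, DataType::Float32);  // NHWC, illustrative shape
    TensorInfo weights({1, 3, 3, 1}, DataType::Float32);
    TensorInfo output ({1, 3, 3, 1}, DataType::Float32);

    Convolution2dDescriptor desc;
    desc.m_DataLayout  = DataLayout::NHWC;
    desc.m_StrideX     = 1;
    desc.m_StrideY     = 1;
    desc.m_BiasEnabled = false;

    auto handle = GetILayerSupportByBackendId("GpuFsa");
    std::string reason;
    bool supported = handle.IsConvolution2dSupported(input, output, desc, weights,
                                                     EmptyOptional(),
                                                     Optional<std::string&>(reason));
    if (!supported)
    {
        std::cout << "GpuFsa rejected Conv2d: " << reason << "\n";
    }
    return supported;
}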
+// SPDX-License-Identifier: MIT +// +#include "GpuFsaMemoryManager.hpp" +#include "Exceptions.hpp" + +#include + +namespace armnn +{ + +GpuFsaMemoryManager::GpuFsaMemoryManager() +{} + +GpuFsaMemoryManager::~GpuFsaMemoryManager() +{} + +GpuFsaMemoryManager::Pool* GpuFsaMemoryManager::Manage(unsigned int numBytes) +{ + if (!m_FreePools.empty()) + { + Pool* res = m_FreePools.back(); + m_FreePools.pop_back(); + res->Reserve(numBytes); + return res; + } + else + { + m_Pools.push_front(Pool(numBytes)); + return &m_Pools.front(); + } +} + +void GpuFsaMemoryManager::Allocate(GpuFsaMemoryManager::Pool* pool) +{ + if (pool == nullptr) + { + throw armnn::MemoryValidationException( + "GpuFsaMemoryManager: Allocate: Attempting to allocate a null memory pool ptr"); + } + m_FreePools.push_back(pool); +} + +void* GpuFsaMemoryManager::GetPointer(GpuFsaMemoryManager::Pool* pool) +{ + return pool->GetPointer(); +} + +void GpuFsaMemoryManager::Acquire() +{ + for (Pool &pool: m_Pools) + { + pool.Acquire(); + } +} + +void GpuFsaMemoryManager::Release() +{ + for (Pool &pool: m_Pools) + { + pool.Release(); + } +} + +GpuFsaMemoryManager::Pool::Pool(unsigned int numBytes) + : m_Size(numBytes), + m_Pointer(nullptr) +{} + +GpuFsaMemoryManager::Pool::~Pool() +{ + if (m_Pointer) + { + Release(); + } +} + +void* GpuFsaMemoryManager::Pool::GetPointer() +{ + if (m_Pointer == nullptr) + { + throw armnn::MemoryValidationException( + "GpuFsaMemoryManager::Pool::GetPointer() called when memory not acquired"); + } + return m_Pointer; +} + +void GpuFsaMemoryManager::Pool::Reserve(unsigned int numBytes) +{ + if (m_Pointer != nullptr) + { + throw armnn::MemoryValidationException( + "GpuFsaMemoryManager::Pool::Reserve() cannot be called after memory acquired"); + } + m_Size = std::max(m_Size, numBytes); +} + +void GpuFsaMemoryManager::Pool::Acquire() +{ + if (m_Pointer != nullptr) + { + throw armnn::MemoryValidationException( + "GpuFsaMemoryManager::Pool::Acquire() called when memory already acquired"); + } + m_Pointer = ::operator new(size_t(m_Size)); +} + +void GpuFsaMemoryManager::Pool::Release() +{ + if (m_Pointer == nullptr) + { + throw armnn::MemoryValidationException( + "GpuFsaMemoryManager::Pool::Release() called when memory not acquired"); + } + ::operator delete(m_Pointer); + m_Pointer = nullptr; +} + +} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp new file mode 100644 index 0000000000..f68273a786 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp @@ -0,0 +1,59 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
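Editorial sketch: the dummy memory manager above follows a pool protocol: Manage() hands out (or reuses) a pool while the graph is being planned, Allocate() returns the pool to the free list for reuse, and Acquire()/Release() actually back and drop the storage for every pool. A short usage sketch of that sequence, using only the class declared in this patch:

// Usage sketch for the pool protocol implemented by GpuFsaMemoryManager above.
#include "GpuFsaMemoryManager.hpp"
#include <cstring>

void PoolLifecycleExample()
{
    armnn::GpuFsaMemoryManager memoryManager;

    // Planning phase: reserve a pool for this tensor, then mark it reusable.
    auto* pool = memoryManager.Manage(1024);
    memoryManager.Allocate(pool);

    // Execution phase: Acquire() backs every pool with real memory.
    memoryManager.Acquire();
    void* data = memoryManager.GetPointer(pool);
    std::memset(data, 0, 1024);

    // Teardown: Release() frees the backing memory again.
    memoryManager.Release();
}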
+// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include +#include + +namespace armnn +{ + +// A dummy MemoryManager which will be deleted once the GpuFsa Backend is integrated with ClMemoryManager +class GpuFsaMemoryManager : public IMemoryManager +{ +public: + GpuFsaMemoryManager(); + virtual ~GpuFsaMemoryManager(); + + class Pool; + + Pool* Manage(unsigned int numBytes); + + void Allocate(Pool *pool); + + void* GetPointer(Pool *pool); + + void Acquire() override; + void Release() override; + + class Pool + { + public: + Pool(unsigned int numBytes); + ~Pool(); + + void Acquire(); + void Release(); + + void* GetPointer(); + + void Reserve(unsigned int numBytes); + + private: + unsigned int m_Size; + void* m_Pointer; + }; + +private: + GpuFsaMemoryManager(const GpuFsaMemoryManager&) = delete; // Noncopyable + GpuFsaMemoryManager& operator=(const GpuFsaMemoryManager&) = delete; // Noncopyable + + std::forward_list m_Pools; + std::vector m_FreePools; +}; + +} diff --git a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp new file mode 100644 index 0000000000..9efb300576 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp @@ -0,0 +1,21 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaBackend.hpp" +#include + +namespace +{ +using namespace armnn; +static BackendRegistry::StaticRegistryInitializer g_RegisterHelper +{ + BackendRegistryInstance(), + GpuFsaBackend::GetIdStatic(), + []() + { + return IBackendInternalUniquePtr(new GpuFsaBackend); + } +}; +} // Anonymous namespace \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp new file mode 100644 index 0000000000..249b915ce1 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp @@ -0,0 +1,188 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
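Editorial sketch: the static initializer in GpuFsaRegistryInitializer.cpp above is what makes "GpuFsa" visible to the rest of Arm NN. The following is a hedged sketch of how an application could confirm the registration and route a trivial network to the backend, assuming the public BackendRegistry, INetwork, IRuntime and Optimize APIs; it is not part of the patch.

// Hedged sketch: confirming the registration above and selecting the backend by its id.
#include <armnn/BackendRegistry.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <iostream>

int main()
{
    using namespace armnn;

    if (!BackendRegistryInstance().IsBackendRegistered("GpuFsa"))
    {
        std::cout << "GpuFsa backend not built into this copy of Arm NN\n";
        return 1;
    }

    IRuntime::CreationOptions options;
    IRuntimePtr runtime = IRuntime::Create(options);

    INetworkPtr network = INetwork::Create();
    IConnectableLayer* input  = network->AddInputLayer(0);
    IConnectableLayer* output = network->AddOutputLayer(0);
    input->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({1, 16}, DataType::Float32));

    // The BackendId string must match GpuFsaBackendId(), i.e. "GpuFsa".
    IOptimizedNetworkPtr optNet = Optimize(*network, {"GpuFsa"}, runtime->GetDeviceSpec());

    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));
    return 0;
}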
+// SPDX-License-Identifier: MIT +// +#include "GpuFsaTensorHandle.hpp" + +namespace armnn +{ +GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, + std::shared_ptr& memoryManager) + : m_TensorInfo(tensorInfo) + , m_MemoryManager(memoryManager) + , m_Pool(nullptr) + , m_UnmanagedMemory(nullptr) + , m_ImportFlags(static_cast(MemorySource::Undefined)) + , m_Imported(false) + , m_IsImportEnabled(false) +{} + +GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, + MemorySourceFlags importFlags) + : m_TensorInfo(tensorInfo) + , m_Pool(nullptr) + , m_UnmanagedMemory(nullptr) + , m_ImportFlags(importFlags) + , m_Imported(false) + , m_IsImportEnabled(true) +{} + +GpuFsaTensorHandle::~GpuFsaTensorHandle() +{ + if (!m_Pool) + { + // unmanaged + if (!m_Imported) + { + ::operator delete(m_UnmanagedMemory); + } + } +} + +void GpuFsaTensorHandle::Manage() +{ + if (!m_IsImportEnabled) + { + if (m_Pool == nullptr) + { + throw MemoryValidationException("GpuFsaTensorHandle::Manage() called twice"); + } + if (m_UnmanagedMemory == nullptr) + { + throw MemoryValidationException("GpuFsaTensorHandle::Manage() called after Allocate()"); + } + + m_Pool = m_MemoryManager->Manage(m_TensorInfo.GetNumBytes()); + } +} + +void GpuFsaTensorHandle::Allocate() +{ + // If import is enabled, do not allocate the tensor + if (!m_IsImportEnabled) + { + + if (!m_UnmanagedMemory) + { + if (!m_Pool) + { + // unmanaged + m_UnmanagedMemory = ::operator new(m_TensorInfo.GetNumBytes()); + } + else + { + m_MemoryManager->Allocate(m_Pool); + } + } + else + { + throw InvalidArgumentException("GpuFsaTensorHandle::Allocate Trying to allocate a GpuFsaTensorHandle" + "that already has allocated memory."); + } + } +} + +const void* GpuFsaTensorHandle::Map(bool /*unused*/) const +{ + return GetPointer(); +} + +void* GpuFsaTensorHandle::GetPointer() const +{ + if (m_UnmanagedMemory) + { + return m_UnmanagedMemory; + } + else if (m_Pool) + { + return m_MemoryManager->GetPointer(m_Pool); + } + else + { + throw NullPointerException("GpuFsaTensorHandle::GetPointer called on unmanaged, unallocated tensor handle"); + } +} + +void GpuFsaTensorHandle::CopyOutTo(void* dest) const +{ + const void *src = GetPointer(); + if (src == nullptr) + { + throw MemoryValidationException("GpuFsaTensorhandle: CopyOutTo: Invalid memory src pointer"); + } + memcpy(dest, src, m_TensorInfo.GetNumBytes()); +} + +void GpuFsaTensorHandle::CopyInFrom(const void* src) +{ + void *dest = GetPointer(); + if (dest == nullptr) + { + throw MemoryValidationException("GpuFsaTensorhandle: CopyInFrom: Invalid memory dest pointer"); + } + memcpy(dest, src, m_TensorInfo.GetNumBytes()); +} + +bool GpuFsaTensorHandle::Import(void* memory, MemorySource source) +{ + if (m_ImportFlags & static_cast(source)) + { + if (m_IsImportEnabled && source == MemorySource::Malloc) + { + // Check memory alignment + if(!CanBeImported(memory, source)) + { + if (m_Imported) + { + m_Imported = false; + m_UnmanagedMemory = nullptr; + } + return false; + } + + // m_UnmanagedMemory not yet allocated. + if (!m_Imported && !m_UnmanagedMemory) + { + m_UnmanagedMemory = memory; + m_Imported = true; + return true; + } + + // m_UnmanagedMemory initially allocated with Allocate(). + if (!m_Imported && m_UnmanagedMemory) + { + return false; + } + + // m_UnmanagedMemory previously imported. 
+ if (m_Imported) + { + m_UnmanagedMemory = memory; + return true; + } + } + } + + return false; +} + +bool GpuFsaTensorHandle::CanBeImported(void* memory, MemorySource source) +{ + if (m_ImportFlags & static_cast(source)) + { + if (m_IsImportEnabled && source == MemorySource::Malloc) + { + uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType()); + if (reinterpret_cast(memory) % alignment) + { + return false; + } + return true; + } + } + return false; +} + + + +} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp new file mode 100644 index 0000000000..d6901d1225 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp @@ -0,0 +1,361 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace armnn +{ + +class GpuFsaTensorHandle : public IClTensorHandle +{ +public: + GpuFsaTensorHandle(const TensorInfo& tensorInfo) + : m_ImportFlags(static_cast(MemorySource::Undefined)), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo); + } + + GpuFsaTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + MemorySourceFlags importFlags = static_cast(MemorySource::Undefined)) + : m_ImportFlags(importFlags), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout); + } + + arm_compute::CLTensor& GetTensor() override { return m_Tensor; } + arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; } + virtual void Allocate() override + { + // If we have enabled Importing, don't allocate the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to allocate memory when importing"); + } + else + { + armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor); + } + + } + + virtual void Manage() override + { + // If we have enabled Importing, don't manage the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to manage memory when importing"); + } + else + { + assert(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } + } + + virtual const void* Map(bool blocking = true) const override + { + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle* GetParent() const override { return nullptr; } + + virtual arm_compute::DataType GetDataType() const override + { + return m_Tensor.info()->data_type(); + } + + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) override + { + m_MemoryGroup = PolymorphicPointerDowncast(memoryGroup); + } + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + + void SetImportFlags(MemorySourceFlags importFlags) + { + m_ImportFlags = importFlags; + } + + MemorySourceFlags GetImportFlags() const override + { + return m_ImportFlags; + } + + void SetImportEnabledFlag(bool importEnabledFlag) + { + 
m_IsImportEnabled = importEnabledFlag; + } + + virtual bool Import(void* /*memory*/, MemorySource source) override + { + if (m_ImportFlags & static_cast(source)) + { + throw MemoryImportException("GpuFsaTensorHandle::Incorrect import flag"); + } + m_Imported = false; + return false; + } + + virtual bool CanBeImported(void* /*memory*/, MemorySource /*source*/) override + { + // This TensorHandle can never import. + return false; + } + +private: + // Only used for testing + void CopyOutTo(void* memory) const override + { + const_cast(this)->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + const_cast(this)->Unmap(); + } + + // Only used for testing + void CopyInFrom(const void* memory) override + { + this->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + this->Unmap(); + } + + arm_compute::CLTensor m_Tensor; + std::shared_ptr m_MemoryGroup; + MemorySourceFlags m_ImportFlags; + bool m_Imported; + bool m_IsImportEnabled; +}; + +class GpuFsaSubTensorHandle : public IClTensorHandle +{ +public: + GpuFsaSubTensorHandle(IClTensorHandle* parent, + const arm_compute::TensorShape& shape, + const arm_compute::Coordinates& coords) + : m_Tensor(&parent->GetTensor(), shape, coords) + { + parentHandle = parent; + } + + arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; } + arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; } + + virtual void Allocate() override {} + virtual 
void Manage() override {} + + virtual const void* Map(bool blocking = true) const override + { + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle* GetParent() const override { return parentHandle; } + + virtual arm_compute::DataType GetDataType() const override + { + return m_Tensor.info()->data_type(); + } + + virtual void SetMemoryGroup(const std::shared_ptr&) override {} + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + +private: + // Only used for testing + void CopyOutTo(void* memory) const override + { + const_cast(this)->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + const_cast(this)->Unmap(); + } + + // Only used for testing + void CopyInFrom(const void* memory) override + { + this->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + this->Unmap(); + } + + mutable arm_compute::CLSubTensor m_Tensor; + ITensorHandle* parentHandle = nullptr; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp new file mode 100644 index 
0000000000..c1a34d24e5 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp @@ -0,0 +1,112 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaTensorHandle.hpp" +#include "GpuFsaTensorHandleFactory.hpp" + +namespace armnn +{ + +using FactoryId = ITensorHandleFactory::FactoryId; + +std::unique_ptr GpuFsaTensorHandleFactory::CreateSubTensorHandle(ITensorHandle& parent, + const TensorShape& subTensorShape, + const unsigned int* subTensorOrigin) const +{ + arm_compute::Coordinates coords; + arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape); + + coords.set_num_dimensions(subTensorShape.GetNumDimensions()); + for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); ++i) + { + // Arm compute indexes tensor coords in reverse order. + unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1; + coords.set(i, armnn::numeric_cast(subTensorOrigin[revertedIndex])); + } + + const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.GetShape()); + + // In order for ACL to support subtensors the concat axis cannot be on x or y and the values of x and y + // must match the parent shapes + if (coords.x() != 0 || coords.y() != 0) + { + return nullptr; + } + if ((parentShape.x() != shape.x()) || (parentShape.y() != shape.y())) + { + return nullptr; + } + + if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape)) + { + return nullptr; + } + + return std::make_unique(PolymorphicDowncast(&parent), shape, coords); +} + +std::unique_ptr GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const +{ + return GpuFsaTensorHandleFactory::CreateTensorHandle(tensorInfo, true); +} + +std::unique_ptr GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout) const +{ + return GpuFsaTensorHandleFactory::CreateTensorHandle(tensorInfo, dataLayout, true); +} + +std::unique_ptr GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo, + const bool IsMemoryManaged) const +{ + std::unique_ptr tensorHandle = std::make_unique(tensorInfo); + if (!IsMemoryManaged) + { + ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed."; + } + tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup()); + return tensorHandle; +} + +std::unique_ptr GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + const bool IsMemoryManaged) const +{ + std::unique_ptr tensorHandle = std::make_unique(tensorInfo, dataLayout); + if (!IsMemoryManaged) + { + ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed."; + } + tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup()); + return tensorHandle; +} + +const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic() +{ + static const FactoryId s_Id(GpuFsaTensorHandleFactoryId()); + return s_Id; +} + +const FactoryId& GpuFsaTensorHandleFactory::GetId() const +{ + return GetIdStatic(); +} + +bool GpuFsaTensorHandleFactory::SupportsSubTensors() const +{ + return true; +} + +MemorySourceFlags GpuFsaTensorHandleFactory::GetExportFlags() const +{ + return MemorySourceFlags(MemorySource::Undefined); +} + +MemorySourceFlags GpuFsaTensorHandleFactory::GetImportFlags() const +{ + return MemorySourceFlags(MemorySource::Undefined); +} + +} // namespace 
armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp new file mode 100644 index 0000000000..93a44259f6 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include + +namespace armnn +{ + +constexpr const char * GpuFsaTensorHandleFactoryId() { return "Arm/GpuFsa/TensorHandleFactory"; } + +class GpuFsaTensorHandleFactory : public ITensorHandleFactory +{ + +public: + GpuFsaTensorHandleFactory(std::shared_ptr mgr) + : m_MemoryManager(mgr) + {} + + std::unique_ptr CreateSubTensorHandle(ITensorHandle& parent, + TensorShape const& subTensorShape, + unsigned int const* subTensorOrigin) const override; + + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo) const override; + + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout) const override; + + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo, + const bool IsMemoryManaged) const override; + + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + const bool IsMemoryManaged) const override; + + static const FactoryId& GetIdStatic(); + + const FactoryId& GetId() const override; + + bool SupportsSubTensors() const override; + + MemorySourceFlags GetExportFlags() const override; + + MemorySourceFlags GetImportFlags() const override; + +private: + mutable std::shared_ptr m_MemoryManager; + +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp new file mode 100644 index 0000000000..6d13879f51 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp @@ -0,0 +1,91 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include + +#include "GpuFsaWorkloadFactory.hpp" +#include "GpuFsaBackendId.hpp" +#include "GpuFsaTensorHandle.hpp" + +namespace armnn +{ + +namespace +{ +static const BackendId s_Id{GpuFsaBackendId()}; +} +template +std::unique_ptr GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& /*descriptor*/, + const WorkloadInfo& /*info*/) const +{ + return nullptr; +} + +template +bool IsDataType(const WorkloadInfo& info) +{ + auto checkType = [](const TensorInfo& tensorInfo) {return tensorInfo.GetDataType() == ArmnnType;}; + auto it = std::find_if(std::begin(info.m_InputTensorInfos), std::end(info.m_InputTensorInfos), checkType); + if (it != std::end(info.m_InputTensorInfos)) + { + return true; + } + it = std::find_if(std::begin(info.m_OutputTensorInfos), std::end(info.m_OutputTensorInfos), checkType); + if (it != std::end(info.m_OutputTensorInfos)) + { + return true; + } + return false; +} + +GpuFsaWorkloadFactory::GpuFsaWorkloadFactory(const std::shared_ptr& memoryManager) + : m_MemoryManager(memoryManager) +{ +} + +GpuFsaWorkloadFactory::GpuFsaWorkloadFactory() + : m_MemoryManager(new GpuFsaMemoryManager()) +{ +} + +const BackendId& GpuFsaWorkloadFactory::GetBackendId() const +{ + return s_Id; +} + +bool GpuFsaWorkloadFactory::IsLayerSupported(const Layer& layer, + Optional dataType, + std::string& outReasonIfUnsupported) +{ + return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported); +} + +std::unique_ptr GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo, + const bool /*isMemoryManaged*/) const +{ + std::unique_ptr tensorHandle = std::make_unique(tensorInfo); + tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup()); + + return tensorHandle; +} + +std::unique_ptr GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + const bool /*isMemoryManaged*/) const +{ + std::unique_ptr tensorHandle = std::make_unique(tensorInfo, dataLayout); + tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup()); + + return tensorHandle; +} + +std::unique_ptr GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/, + const QueueDescriptor& /*descriptor*/, + const WorkloadInfo& /*info*/) const +{ + return nullptr; +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp new file mode 100644 index 0000000000..9b97070766 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp @@ -0,0 +1,59 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include + +namespace armnn +{ + +// Dynamic Fusion workload factory. 
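+// In this initial skeleton the factory only creates tensor handles: MakeWorkload() and
+// CreateWorkload() in GpuFsaWorkloadFactory.cpp both return nullptr, because layers are
+// expected to be fused into a pre-compiled workload instead. A rough usage sketch
+// (hypothetical caller code, not part of this patch):
+//
+//     armnn::GpuFsaWorkloadFactory factory;           // default-constructs a GpuFsaMemoryManager
+//     armnn::TensorInfo info({1, 3, 3, 1}, armnn::DataType::Float32);
+//     auto handle = factory.CreateTensorHandle(info); // returns a memory-managed GpuFsaTensorHandle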
+class GpuFsaWorkloadFactory : public IWorkloadFactory +{ +public: + explicit GpuFsaWorkloadFactory(const std::shared_ptr& memoryManager); + GpuFsaWorkloadFactory(); + + ~GpuFsaWorkloadFactory() {} + + const BackendId& GetBackendId() const override; + + static bool IsLayerSupported(const Layer& layer, + Optional dataType, + std::string& outReasonIfUnsupported); + + bool SupportsSubTensors() const override { return false; } + + ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateSubTensorHandle instead") + std::unique_ptr CreateSubTensorHandle(ITensorHandle& /*parent*/, + TensorShape const& /*subTensorShape*/, + unsigned int const* /*subTensorOrigin*/) const override + { + return nullptr; + } + + ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateTensorHandle instead") + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo, + const bool IsMemoryManaged = true) const override; + + ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateTensorHandle instead") + std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + const bool IsMemoryManaged = true) const override; + + std::unique_ptr CreateWorkload(LayerType type, + const QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + +private: + template + std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const; + + mutable std::shared_ptr m_MemoryManager; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/backend.cmake b/src/backends/gpuFsa/backend.cmake new file mode 100644 index 0000000000..16473336e0 --- /dev/null +++ b/src/backends/gpuFsa/backend.cmake @@ -0,0 +1,15 @@ +# +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT +# + +add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/gpuFsa) +list(APPEND armnnLibraries armnnGpuFsaBackend) + +if(ARMCOMPUTEGPUFSA) + list(APPEND armnnLibraries armnnGpuFsaBackendLayerValidators) + list(APPEND armnnLibraries armnnGpuFsaBackendWorkloads) + list(APPEND armnnUnitTestLibraries armnnGpuFsaBackendUnitTests) +else() + message(STATUS "GPU Dynamic Fusion backend is disabled") +endif() diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk new file mode 100644 index 0000000000..d8d254205b --- /dev/null +++ b/src/backends/gpuFsa/backend.mk @@ -0,0 +1,58 @@ +# +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
+# SPDX-License-Identifier: MIT +# + +# BACKEND_SOURCES contains the list of files to be included +# in the Android build and it is picked up by the Android.mk +# file in the root of ArmNN + +# The variable to enable/disable the GPU Dynamic Fusion backend +# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk) +ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1) + +# ARMNN_COMPUTE_GPUFSA_ENABLED == 1 +# Include the source files for the GPU Dynamic Fusion backend + +BACKEND_SOURCES := \ + GpuFsaBackend.cpp \ + GpuFsaBackendContext.cpp \ + GpuFsaContextControl.cpp \ + GpuFsaLayerSupport.cpp \ + GpuFsaRegistryInitializer.cpp \ + GpuFsaTensorHandleFactory.cpp \ + GpuFsaWorkloadFactory.cpp \ + layerValidators/GpuFsaConvolution2dValidate.cpp +else + +# ARMNN_COMPUTE_GPUFSA_ENABLED == 0 +# No source file will be compiled for the GPU Dynamic Fusion backend + +BACKEND_SOURCES := + +endif + +# BACKEND_TEST_SOURCES contains the list of files to be included +# in the Android unit test build (armnn-tests) and it is picked +# up by the Android.mk file in the root of ArmNN + +# The variable to enable/disable the GPU Dynamic Fusion backend +# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk) +ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1) + +# ARMNN_COMPUTE_GPUFSA_ENABLED == 1 +# Include the source files for the GPU Dynamic Fusion backend tests + +BACKEND_TEST_SOURCES := \ + test/GpuFsaEndToEndTests.cpp \ + test/GpuFsaLayerSupportTests.cpp \ + test/GpuFsaLayerTests.cpp \ + test/GpuFsaOptimizedNetworkTests.cpp +else + +# ARMNN_COMPUTE_GPUFSA_ENABLED == 0 +# No source file will be compiled for the GPU Dynamic Fusion backend tests + +BACKEND_TEST_SOURCES := + +endif diff --git a/src/backends/gpuFsa/layerValidators/CMakeLists.txt b/src/backends/gpuFsa/layerValidators/CMakeLists.txt new file mode 100644 index 0000000000..57ea41d56c --- /dev/null +++ b/src/backends/gpuFsa/layerValidators/CMakeLists.txt @@ -0,0 +1,14 @@ +# +# Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT +# + +list(APPEND armnnGpuFsaBackendLayerValidators_sources + GpuFsaConvolution2dValidate.cpp + GpuFsaConvolution2dValidate.hpp + ) + +add_library(armnnGpuFsaBackendLayerValidators OBJECT ${armnnGpuFsaBackendLayerValidators_sources}) +target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn) +target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils) +target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/backends) diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp new file mode 100644 index 0000000000..bed7b26f74 --- /dev/null +++ b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp @@ -0,0 +1,126 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "GpuFsaConvolution2dValidate.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace armnn +{ + +using namespace armcomputetensorutils; + +inline arm_compute::Status ValidateAndCreateOp(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases, + const bool createOp = false) +{ + // Create a new workload sketch, for validation purposes + auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context(); + auto gpuCtx = GpuWorkloadContext(&compileCtx); + GpuWorkloadSketch sketch{ &gpuCtx }; + + // Build and create tensor infos using the sketch + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); + arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); + aclWeightsInfo.set_are_values_constant(weights.IsConstant()); + + auto inputInfo = gpuCtx.create_tensor_info(aclInputInfo); + auto weightInfo = gpuCtx.create_tensor_info(aclWeightsInfo); + + // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op + arm_compute::TensorInfo aclBiasInfo; + arm_compute::TensorInfo biasSketchInfo; + arm_compute::TensorInfo* biasSketchInfoPtr = nullptr; + + if (descriptor.m_BiasEnabled) + { + if(!biases.has_value()) + { + throw InvalidArgumentException("GpuFsaConvolution2dValidate: No biases set when biases are enabled"); + } + aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); + aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); + + biasSketchInfo = gpuCtx.create_tensor_info(aclBiasInfo); + biasSketchInfoPtr = &biasSketchInfo; + } + + // Set Conv2d attributes using descriptor + const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX, + descriptor.m_DilationY); + const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor); + const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY); + + Conv2dAttributes conv2DAttributes{}; + conv2DAttributes.dilation(aclDilationInfo); + conv2DAttributes.pad(aclPadInfo); + conv2DAttributes.stride(aclStrideInfo); + + // Validate operator, check status and update reasonIfUnsupported + arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch, + &inputInfo, + &weightInfo, + biasSketchInfoPtr, + conv2DAttributes); + + if (createOp) + { + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported) + { + throw BackendCapabilityException("\"GpuFsa\" backend failed during operation validation when attempting " + "to fuse a GpuConv2d operator into the existing workload sketch."); + } + + arm_compute::ITensorInfo* convOutInfo = GpuConv2d::create_op(sketch, + &inputInfo, + &weightInfo, + biasSketchInfoPtr, + conv2DAttributes); + + // Temporary fix until fusing attempt is make for GpuFsa backend and Output layer workload is created. 
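+        // GpuOutput marks the convolution result as an output of the workload sketch,
+        // giving the fused sequence a destination tensor to write into.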
+ auto outputInfo = gpuCtx.create_tensor_info(); + GpuOutput::create_op(sketch, convOutInfo, &outputInfo); + } + + return aclStatus; +} + +arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases) +{ + return ValidateAndCreateOp(input, descriptor, weights, biases); +} + +void GpuFsaConvolution2dCreateOp(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases) +{ + ValidateAndCreateOp(input, descriptor, weights, biases, true); +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp new file mode 100644 index 0000000000..120060e8ad --- /dev/null +++ b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +#include +#include + +namespace armnn +{ + +using namespace arm_compute::experimental::dynamic_fusion; + +arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases); + +void GpuFsaConvolution2dCreateOp(const TensorInfo& input, + const Convolution2dDescriptor& descriptor, + const TensorInfo& weights, + const Optional& biases); + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/test/CMakeLists.txt b/src/backends/gpuFsa/test/CMakeLists.txt new file mode 100644 index 0000000000..66091e90df --- /dev/null +++ b/src/backends/gpuFsa/test/CMakeLists.txt @@ -0,0 +1,19 @@ +# +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT +# + +list(APPEND armnnGpuFsaBackendUnitTests_sources + GpuFsaDefaultAllocatorTests.cpp + GpuFsaEndToEndTests.cpp + GpuFsaLayerTests.cpp + GpuFsaLayerSupportTests.cpp + GpuFsaOptimizedNetworkTests.cpp +) + +add_library(armnnGpuFsaBackendUnitTests OBJECT ${armnnGpuFsaBackendUnitTests_sources}) +target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn) +target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils) +target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnTestUtils) +target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/backends) +target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/third-party/doctest) diff --git a/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp new file mode 100644 index 0000000000..17d5952217 --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp @@ -0,0 +1,193 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include +#include +#include +#include +#include +#include +// Requires the OpenCl backend to be included (GpuFsa) +#include +#include +#include +#include +#include + +using namespace armnn; + +namespace +{ + +TEST_SUITE("DefaultAllocatorTests") +{ + +TEST_CASE("DefaultAllocatorTest") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +TEST_CASE("DefaultAllocatorTestMulti") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto* inputPtr2 = reinterpret_cast(alignedInputPtr2); + std::fill_n(inputPtr2, numElements, number); + CHECK(inputPtr2[0] == 3); + CHECK(inputPtr2[1] == 3); + + // No overlap + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +TEST_CASE("DefaultAllocatorTestMock") +{ + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + IRuntimePtr run = IRuntime::Create(options); + + // Initialize Mock Backend + MockBackendInitialiser initialiser; + auto factoryFun = BackendRegistryInstance().GetFactory(MockBackend().GetIdStatic()); + CHECK(factoryFun != nullptr); + auto backend = factoryFun(); + auto defaultAllocator = backend->GetDefaultAllocator(); + + // GetMemorySourceType + CHECK(defaultAllocator->GetMemorySourceType() == MemorySource::Malloc); + + size_t totalBytes = 1 * sizeof(float); + // Allocate + void* ptr = defaultAllocator->allocate(totalBytes, 0); + + // GetMemoryRegionAtOffset + CHECK(defaultAllocator->GetMemoryRegionAtOffset(ptr, 0, 0)); + + // Free + defaultAllocator->free(ptr); + + // Clean up + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.Deregister(MockBackend().GetIdStatic()); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +} + + +TEST_SUITE("GpuFsaDefaultAllocatorTests") +{ + +TEST_CASE("GpuFsaDefaultAllocatorTest") +{ + 
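+    // Register a custom default allocator for the "GpuFsa" backend via the runtime
+    // creation options, allocate a buffer through it and check it is host-writable.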
float number = 3; + + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +TEST_CASE("GpuFsaDefaultAllocatorTestMulti") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto* inputPtr2 = reinterpret_cast(alignedInputPtr2); + std::fill_n(inputPtr2, numElements, number); + CHECK(inputPtr2[0] == 3); + CHECK(inputPtr2[1] == 3); + + // No overlap + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp new file mode 100644 index 0000000000..1d6b99a31f --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp @@ -0,0 +1,8 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "backendsCommon/test/EndToEndTestImpl.hpp" + +#include \ No newline at end of file diff --git a/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp b/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp new file mode 100644 index 0000000000..f162df0b55 --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp @@ -0,0 +1,64 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include +#include + +#include + +#include + +#include + +using namespace armnn; + +TEST_SUITE("GpuFsaLayerSupport") +{ + +TEST_CASE("IsLayerSupportedGpuFsaConv2d") +{ + TensorInfo inputInfo ({ 1, 5, 5, 1 }, DataType::Float32); + TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32); + TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true); + TensorInfo biasesInfo ({ 1 }, DataType::Float32, 0.0f, 0, true); + + Convolution2dDescriptor desc; + desc.m_BiasEnabled = true; + desc.m_DataLayout = DataLayout::NHWC; + + GpuFsaLayerSupport supportChecker; + std::string reasonIfNotSupported; + auto supported = supportChecker.IsLayerSupported(LayerType::Convolution2d, + {inputInfo, outputInfo, weightsInfo, biasesInfo}, + desc, + EmptyOptional(), + EmptyOptional(), + reasonIfNotSupported); + CHECK(supported); +} + +TEST_CASE("IsLayerSupportedGpuFsaConv2dUnsupported") +{ + TensorInfo inputInfo ({ 1, 5, 5, 1 }, DataType::Float32); + TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32); + TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true); + + // NCHW is unsupported. + Convolution2dDescriptor desc; + desc.m_DataLayout = DataLayout::NCHW; + + GpuFsaLayerSupport supportChecker; + std::string reasonIfNotSupported; + auto supported = supportChecker.IsLayerSupported(LayerType::Convolution2d, + {inputInfo, outputInfo, weightsInfo, TensorInfo()}, + desc, + EmptyOptional(), + EmptyOptional(), + reasonIfNotSupported); + CHECK(!supported); + REQUIRE(reasonIfNotSupported.find("NCHW not supported by this kernel") != std::string::npos); +} + +} \ No newline at end of file diff --git a/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp b/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp new file mode 100644 index 0000000000..e032922d17 --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp @@ -0,0 +1,12 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaWorkloadFactoryHelper.hpp" + +#include + +#include + +#include \ No newline at end of file diff --git a/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp b/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp new file mode 100644 index 0000000000..7e094cec1e --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp @@ -0,0 +1,137 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include + +#include +#include + +#include + +using namespace armnn; + +TEST_SUITE("GpuFsaOptimizedNetwork") +{ + +TEST_CASE("SingleConv2dSupportedOptimizedNetwork") +{ + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + INetworkPtr network(INetwork::Create()); + + TensorInfo inputInfo({ 1, 5, 5, 1 }, DataType::Float32); + TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32); + TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true); + TensorInfo biasesInfo({ 1 }, DataType::Float32, 0.0f, 0, true); + + Convolution2dDescriptor desc; + desc.m_BiasEnabled = true; + desc.m_DataLayout = DataLayout::NHWC; + + auto inputLayer = network->AddInputLayer(0, "input"); + auto weightLayer = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights"); + auto biasLayer = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias"); + auto convLayer = network->AddConvolution2dLayer(desc, "conv2d"); + auto outputLayer = network->AddOutputLayer(1, "output"); + + inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + + weightLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1)); + weightLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo); + + biasLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(2)); + biasLayer->GetOutputSlot(0).SetTensorInfo(biasesInfo); + + convLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + + std::vector backends = { "GpuFsa" }; + + OptimizerOptionsOpaque optimizedOptions; + IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optimizedOptions); + CHECK(optNet); + + Graph& graph = GetGraphForTesting(optNet.get()); + + // Check graph layer sequence to ensure that the network has been replaced with a PreCompiledLayer + CHECK(CheckSequence(graph.cbegin(), graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); +} + +TEST_CASE("TwoConv2dSupportedOptimizedNetwork") +{ + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + INetworkPtr network(INetwork::Create()); + + TensorInfo inputInfo({ 1, 5, 5, 1 }, DataType::Float32); + TensorInfo intermediateInfo({ 1, 3, 3, 1 }, DataType::Float32); + TensorInfo outputInfo({ 1, 1, 1, 1 }, DataType::Float32); + TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true); + TensorInfo biasesInfo({ 1 }, DataType::Float32, 0.0f, 0, true); + + Convolution2dDescriptor desc; + desc.m_BiasEnabled = true; + desc.m_DataLayout = DataLayout::NHWC; + + auto inputLayer = network->AddInputLayer(0, "input"); + + auto weightLayer1 = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights"); + auto biasLayer1 = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias"); + auto convLayer1 = network->AddConvolution2dLayer(desc, "conv2d"); + + auto weightLayer2 = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights"); + auto biasLayer2 = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias"); + auto convLayer2 = network->AddConvolution2dLayer(desc, "conv2d"); + + auto outputLayer = network->AddOutputLayer(0, "output"); + + inputLayer->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(0)); + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + + weightLayer1->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(1)); + 
weightLayer1->GetOutputSlot(0).SetTensorInfo(weightsInfo); + + biasLayer1->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(2)); + biasLayer1->GetOutputSlot(0).SetTensorInfo(biasesInfo); + + convLayer1->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(0)); + convLayer1->GetOutputSlot(0).SetTensorInfo(intermediateInfo); + + weightLayer2->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(1)); + weightLayer2->GetOutputSlot(0).SetTensorInfo(weightsInfo); + + biasLayer2->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(2)); + biasLayer2->GetOutputSlot(0).SetTensorInfo(biasesInfo); + + convLayer2->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + convLayer2->GetOutputSlot(0).SetTensorInfo(outputInfo); + + std::vector backends = { "GpuFsa" }; + + OptimizerOptionsOpaque optimizedOptions; + IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optimizedOptions); + CHECK(optNet); + + Graph& graph = GetGraphForTesting(optNet.get()); + + // Check graph layer sequence to ensure that the network has been replaced with a PreCompiledLayer + CHECK(CheckSequence(graph.cbegin(), graph.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); +} + +} \ No newline at end of file diff --git a/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp b/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp new file mode 100644 index 0000000000..c1d75d625b --- /dev/null +++ b/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp @@ -0,0 +1,45 @@ +// +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include + +#include + +#include +#include +#include "gpuFsa/GpuFsaTensorHandleFactory.hpp" + +namespace +{ + +template<> +struct WorkloadFactoryHelper +{ + static armnn::IBackendInternal::IMemoryManagerSharedPtr GetMemoryManager() + { + armnn::GpuFsaBackend backend; + return backend.CreateMemoryManager(); + } + + static armnn::GpuFsaWorkloadFactory GetFactory( + const armnn::IBackendInternal::IMemoryManagerSharedPtr&) + { + return armnn::GpuFsaWorkloadFactory(); + } + + static armnn::GpuFsaTensorHandleFactory GetTensorHandleFactory( + const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) + { + + return armnn::GpuFsaTensorHandleFactory( + armnn::PolymorphicPointerDowncast(memoryManager)); + } +}; + +using GpuFsaWorkloadFactoryHelper = WorkloadFactoryHelper; + +} // anonymous namespace diff --git a/src/backends/gpuFsa/workloads/CMakeLists.txt b/src/backends/gpuFsa/workloads/CMakeLists.txt new file mode 100644 index 0000000000..4d100123ea --- /dev/null +++ b/src/backends/gpuFsa/workloads/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnGpuFsaBackendWorkloads_sources
+    GpuFsaBaseWorkload.hpp
+)
+
+add_library(armnnGpuFsaBackendWorkloads OBJECT ${armnnGpuFsaBackendWorkloads_sources})
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/backends)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/profiling)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/profiling/common/include)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/profiling/client/include)
diff --git a/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp
new file mode 100644
index 0000000000..c274e14665
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/Workload.hpp>
+
+namespace armnn
+{
+
+template <typename QueueDescriptor>
+class GpuFsaBaseWorkload : public BaseWorkload<QueueDescriptor>
+{
+public:
+    GpuFsaBaseWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
+        : BaseWorkload<QueueDescriptor>(descriptor, info)
+    {}
+
+    virtual bool SupportsTensorHandleReplacement() const override
+    {
+        return true;
+    }
+
+    // Replace input tensor handle with the given TensorHandle
+    void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+    {
+        this->m_Data.m_Inputs[slot] = tensorHandle;
+    }
+
+    // Replace output tensor handle with the given TensorHandle
+    void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+    {
+        this->m_Data.m_Outputs[slot] = tensorHandle;
+    }
+};
+
+} //namespace armnn
\ No newline at end of file
-- 
cgit v1.2.1
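The GpuFsaBaseWorkload template above only contributes tensor-handle replacement; it does not implement Execute(). As a rough illustration of how a concrete workload could build on it, the sketch below derives a trivial subclass. It is hypothetical and not part of this patch: the class name GpuFsaExampleActivationWorkload and the choice of ActivationQueueDescriptor are placeholders.

// Hypothetical sketch only; no concrete GpuFsa workload exists in this change.
#include <armnn/backends/Workload.hpp>
#include <armnn/backends/WorkloadData.hpp>

#include "GpuFsaBaseWorkload.hpp"

namespace armnn
{

// Inherits handle replacement from GpuFsaBaseWorkload and leaves Execute() empty,
// since real execution is expected to go through the fused, pre-compiled workload
// produced by the backend rather than per-layer workloads.
class GpuFsaExampleActivationWorkload : public GpuFsaBaseWorkload<ActivationQueueDescriptor>
{
public:
    using GpuFsaBaseWorkload<ActivationQueueDescriptor>::GpuFsaBaseWorkload;

    void Execute() const override {}
};

} // namespace armnn

Leaving Execute() empty mirrors the skeleton's intent that the GpuFsa backend runs work through the dynamic-fusion sketch, not through individual layer workloads.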