//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinary.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaResize.hpp"

namespace armnn
{

template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

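// Tensor handle factory support: the backend advertises a single handle factory
// (GpuFsaTensorHandleFactory) and registers it together with the GpuFsaMemoryManager
// that backs its allocations.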
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
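    // Walk the subgraph in reverse. For each layer this backend can handle, a dynamic fusion sketch is
    // configured and wrapped into a PreCompiled layer that is then substituted for the original layer.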
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Create a GpuFsaPreCompiledBlob, this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and setup the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr, input, *desc, weights, bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr, input, *desc, weights, EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr, input, *desc, weights, bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr, input, *desc, weights, EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaElementwiseBinaryCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph = CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                                                                  CreateOutputsFrom(&base),
                                                                                  {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn