//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "ClBackend.hpp"
#include "ClBackendContext.hpp"
#include "ClBackendDefaultAllocator.hpp"
#include "ClBackendId.hpp"
#include "ClBackendModelContext.hpp"
#include "ClImportTensorHandleFactory.hpp"
#include "ClLayerSupport.hpp"
#include "ClTensorHandleFactory.hpp"
#include "ClWorkloadFactory.hpp"

#include <Layer.hpp>

#include <aclCommon/ArmComputeSubgraphUtils.hpp>
#include <aclCommon/ArmComputeUtils.hpp>
#include <aclCommon/BaseMemoryManager.hpp>

#include <armnn/BackendRegistry.hpp>
#include <armnn/Descriptors.hpp>

#include <armnn/utility/PolymorphicDowncast.hpp>

#include "workloads/ClAdditionWorkload.hpp"
#include "workloads/ClBatchNormalizationFloatWorkload.hpp"
#include "workloads/ClConvolution2dWorkload.hpp"
#include "workloads/ClDepthwiseConvolutionWorkload.hpp"
#include "workloads/ClDivisionWorkload.hpp"
#include "workloads/ClFullyConnectedWorkload.hpp"
#include "workloads/ClMultiplicationWorkload.hpp"
#include "workloads/ClReduceWorkload.hpp"
#include "workloads/ClSubtractionWorkload.hpp"

#include <Optimizer.hpp>

#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

namespace armnn
{

const BackendId& ClBackend::GetIdStatic()
{
    static const BackendId s_Id{ClBackendId()};
    return s_Id;
}

IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<ClMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager, const ModelOptions& modelOptions) const
{
    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager));
}

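// As above, but additionally forwards the backend-specific ModelOptions to the workload
// factory through CreateBackendSpecificModelContext().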
IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions& modelOptions,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        inputFlags, outputFlags);

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

std::vector<ITensorHandleFactory::FactoryId> ClBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> {ClTensorHandleFactory::GetIdStatic(),
                                                         ClImportTensorHandleFactory::GetIdStatic()};
}

void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));
}

void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                              MemorySourceFlags inputFlags,
                                              MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        inputFlags, outputFlags);

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));
}

IBackendInternal::IBackendContextPtr ClBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new ClBackendContext{options}};
}

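// The CL backend does not provide a backend profiling context, so an empty pointer is returned.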
IBackendInternal::IBackendProfilingContextPtr ClBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::IBackendSpecificModelContextPtr ClBackend::CreateBackendSpecificModelContext(
    const ModelOptions& modelOptions) const
{
    return IBackendSpecificModelContextPtr{new ClBackendModelContext{modelOptions}};
}

IBackendInternal::ILayerSupportSharedPtr ClBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new ClLayerSupport(IBackendInternal::IBackendSpecificModelContextPtr{})
    };
    return layerSupport;
}

IBackendInternal::ILayerSupportSharedPtr ClBackend::GetLayerSupport(const ModelOptions& modelOptions) const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new ClLayerSupport(CreateBackendSpecificModelContext(modelOptions))
    };
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> ClBackend::GetDefaultAllocator() const
{
    return std::make_unique<ClBackendDefaultAllocator>();
}

OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                  const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews;

    auto it = subgraph.endIConnectable();
    bool isFastMathEnabled = false;
    std::map<LayerGuid, Layer*> untouched;

    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    it = subgraph.endIConnectable();
#if defined(ARMCOMPUTECL_ENABLED)
    IBackendInternal::IBackendSpecificModelContextPtr modelContextPtr = CreateBackendSpecificModelContext(modelOptions);

    if (modelContextPtr)
    {
        auto clModelOptions = dynamic_cast<ClBackendModelContext*>(modelContextPtr.get());
        if (clModelOptions)
        {
            isFastMathEnabled = clModelOptions->IsFastMathEnabled();
        }
    }
#endif
    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Fuse activation into previous layer if supported by backend
        if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
             || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
             || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
             || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division)
            && (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
        {
            for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
            {
                if (output->GetNumConnections() == 1)
                {
                    for (auto&& childInput : output->GetConnections())
                    {
                        if ((childInput->GetOwningLayer().GetType() == LayerType::Activation) &&
                            (checkDataTypeInputandOutput(childInput->GetOwningLayer())))
                        {
                            Layer& child = childInput->GetOwningLayer();

                            auto* activationLayer = PolymorphicDowncast<ActivationLayer*>(&child);

                            const std::string name = std::string("fused-") + child.GetName() +
                                                     std::string("-into-") + base.GetName();

                            // Get params from activation layer
                            ActivationDescriptor activationDesc = activationLayer->GetParameters();

                            if (base.GetType() == LayerType::Convolution2d)
                            {
                                Convolution2dLayer* baseLayer = PolymorphicDowncast<Convolution2dLayer*>(&base);

                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->m_Bias->GetTensorInfo();
                                }

                                arm_compute::Status status = ClConvolution2dWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        baseLayer->m_Weight->GetTensorInfo(),
                                        biases,
                                        isFastMathEnabled,
                                        &activationDesc);

                                if (status)
                                {
                                    FuseConvolution2dLayer<Convolution2dLayer>(optimizationViews,
                                                                               baseLayer,
                                                                               activationLayer,
                                                                               activationDesc,
                                                                               name);
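                                    // Remove the fused layers from the 'untouched' map so they are
                                    // reported as substituted rather than untouched.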
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::DepthwiseConvolution2d)
                            {
                                DepthwiseConvolution2dLayer* baseLayer =
                                        PolymorphicDowncast<DepthwiseConvolution2dLayer*>(&base);

                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->m_Bias->GetTensorInfo();
                                }

                                arm_compute::Status status = ClDepthwiseConvolutionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        baseLayer->m_Weight->GetTensorInfo(),
                                        biases,
                                        &activationDesc);

                                if (status)
                                {
                                    FuseDepthwiseConvolution2dLayer<DepthwiseConvolution2dLayer>(optimizationViews,
                                                                                                 baseLayer,
                                                                                                 activationLayer,
                                                                                                 activationDesc,
                                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::FullyConnected)
                            {
                                FullyConnectedLayer* baseLayer = PolymorphicDowncast<FullyConnectedLayer*>(&base);

                                arm_compute::Status status = ClFullyConnectedWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->m_Weight->GetTensorInfo(),
                                        baseLayer->m_Bias->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseFullyConnectedLayer<FullyConnectedLayer>(optimizationViews,
                                                                                 baseLayer,
                                                                                 activationLayer,
                                                                                 activationDesc,
                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::BatchNormalization)
                            {
                                BatchNormalizationLayer* baseLayer =
                                        PolymorphicDowncast<BatchNormalizationLayer*>(&base);

                                arm_compute::Status status = ClBatchNormalizationValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->m_Mean->GetTensorInfo(),
                                        baseLayer->m_Variance->GetTensorInfo(),
                                        baseLayer->m_Beta->GetTensorInfo(),
                                        baseLayer->m_Gamma->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        &activationDesc);

                                if (status)
                                {
                                    BatchNormalizationLayer* replacementLayer =
                                            FuseBatchNormalizationLayer<BatchNormalizationLayer>(optimizationViews,
                                                                                                 baseLayer,
                                                                                                 activationLayer,
                                                                                                 activationDesc,
                                                                                                 name);

                                    replacementLayer->m_Beta     = std::move(baseLayer->m_Beta);
                                    replacementLayer->m_Gamma    = std::move(baseLayer->m_Gamma);
                                    replacementLayer->m_Mean     = std::move(baseLayer->m_Mean);
                                    replacementLayer->m_Variance = std::move(baseLayer->m_Variance);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Addition)
                            {
                                AdditionLayer* baseLayer = PolymorphicDowncast<AdditionLayer*>(&base);

                                arm_compute::Status status = ClAdditionValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseAdditionLayer<AdditionLayer>(optimizationViews,
                                                                     baseLayer,
                                                                     activationLayer,
                                                                     activationDesc,
                                                                     name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Division)
                            {
                                DivisionLayer* baseLayer = PolymorphicDowncast<DivisionLayer*>(&base);

                                arm_compute::Status status = ClDivisionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseDivisionLayer<DivisionLayer>(optimizationViews,
                                                                     baseLayer,
                                                                     activationLayer,
                                                                     activationDesc,
                                                                     name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Multiplication)
                            {
                                MultiplicationLayer* baseLayer = PolymorphicDowncast<MultiplicationLayer*>(&base);

                                arm_compute::Status status = ClMultiplicationWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseMultiplicationLayer<MultiplicationLayer>(optimizationViews,
                                                                                 baseLayer,
                                                                                 activationLayer,
                                                                                 activationDesc,
                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Subtraction)
                            {
                                SubtractionLayer* baseLayer = PolymorphicDowncast<SubtractionLayer*>(&base);

                                arm_compute::Status status = ClSubtractionValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseSubtractionLayer<SubtractionLayer>(optimizationViews,
                                                                           baseLayer,
                                                                           activationLayer,
                                                                           activationDesc,
                                                                           name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                        }
                    }
                }
            }
        }

        // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis.
        if (base.GetType() == LayerType::Reduce)
        {
            ReduceLayer* baseLayer            = PolymorphicDowncast<ReduceLayer*>(&base);
            ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();

            if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1)
            {
                // Add new layers to the graph and connect them.
                std::vector<IConnectableLayer*> layers = ChainReduceLayers<ReduceLayer>(optimizationViews,
                                                                                        baseLayer,
                                                                                        reduceDescriptor);

                // Replace existing baselayer with new subgraph.
                ReplaceLayers<ReduceLayer>(optimizationViews, baseLayer, layers);
                untouched.erase(baseLayer->GetGuid());
            }
        }
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn