From 38b600d8abb2c5f7a44511b5deddf441f975d51d Mon Sep 17 00:00:00 2001
From: Nikhil Raj
Date: Thu, 15 Feb 2024 15:02:19 +0000
Subject: IVGCVSW-7968 Update Doxygen docu for 24.02

Signed-off-by: Nikhil Raj
Change-Id: I8c1e45815c6cf78f80d6f2c0959a5bbba6cd11de
---
 latest/_gpu_fsa_backend_8cpp_source.html | 639 +++++++++++++++++++++++++++++++
 1 file changed, 639 insertions(+)
 create mode 100644 latest/_gpu_fsa_backend_8cpp_source.html

diff --git a/latest/_gpu_fsa_backend_8cpp_source.html b/latest/_gpu_fsa_backend_8cpp_source.html
new file mode 100644
index 0000000000..816cd10ee7
--- /dev/null
+++ b/latest/_gpu_fsa_backend_8cpp_source.html
@@ -0,0 +1,639 @@

Arm NN: src/backends/gpuFsa/GpuFsaBackend.cpp Source File
GpuFsaBackend.cpp
//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaActivation.hpp"
#include "layers/GpuFsaBatchMatMul.hpp"
#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinary.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaReshape.hpp"
#include "layers/GpuFsaResize.hpp"
#include "layers/GpuFsaSoftmax.hpp"

namespace armnn
{

template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                     SubgraphView::OutputSlots&& outputs,
                                                     SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetId() const
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Setup the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob, this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and setup the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Activation):
            {
                auto desc = PolymorphicDowncast<const ActivationDescriptor*>(&base.GetParameters());
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaActivationCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::BatchMatMul):
            {
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const BatchMatMulDescriptor*>(&base.GetParameters());
                GpuFsaBatchMatMulCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case (LayerType::ElementwiseBinary):
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaElementwiseBinaryCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case LayerType::Reshape:
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ReshapeDescriptor*>(&base.GetParameters());
                GpuFsaReshapeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Softmax):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();

                auto desc = PolymorphicDowncast<const SoftmaxDescriptor*>(&base.GetParameters());
                GpuFsaSoftmaxCreateOp(preCompiledBlobPtr,
                                      input,
                                      output,
                                      *desc);
                break;
            }
            default:
                // unsupported layer for GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            GpuFsaBackend::GetIdStatic(),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn
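The DeleteAsType helper and the blob wrapping in OptimizeSubgraphView rely on a small type-erasure idiom: the pre-compiled blob is handed to the generic PreCompiled layer behind a void pointer, and a stored deleter restores the concrete type at destruction time. A minimal standalone sketch of the same idiom follows; the PrecompiledBlob struct and ErasedPtr alias are stand-ins for illustration, not the real GpuFsaPreCompiledBlob or PreCompiledObjectPtr types.

#include <iostream>
#include <memory>

// Stand-in for a backend-specific blob type (hypothetical, for illustration only).
struct PrecompiledBlob
{
    int someCompiledState = 42;
};

// Type-erased deleter: callers only see a void*, the deleter restores the real type.
template <typename T>
void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

// Owning pointer with the concrete type erased, similar in spirit to the wrapper used above.
using ErasedPtr = std::unique_ptr<void, void(*)(const void*)>;

int main()
{
    ErasedPtr blob(new PrecompiledBlob(), DeleteAsType<PrecompiledBlob>);
    // The consumer can store and pass 'blob' around without knowing its concrete type;
    // destruction still runs ~PrecompiledBlob through the stored deleter.
    std::cout << static_cast<const PrecompiledBlob*>(blob.get())->someCompiledState << "\n";
    return 0;
}

Erasing the type keeps the generic PreCompiled layer machinery independent of any one backend's blob layout; only the backend that created the blob needs the complete type.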