From 4cc341cf8b5a6e6bb0543504cbbfde6fa11a2cdb Mon Sep 17 00:00:00 2001
From: Mike Kelly <mike.kelly@arm.com>
Date: Fri, 7 Jul 2023 15:43:06 +0100
Subject: IVGCVSW-7830 Add backend optimizations to remove Reshapes where
 possible

* Added optimization to remove reshapes for Neon and Ref Backends by using
  overridden TensorInfos
* Added ability to delete Subgraphs during Optimization
* Fixed naming error in NeonEndToEndTests and CLEndToEndTests
* Added LayerNameAndTypeCheck for testing.
* Fixed error where layers were not marked as altered when removed in
  CLBackend

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I1ac25cd4ec9821470d961831ae2c8d24882276cc
---
 src/backends/cl/CMakeLists.txt           |   1 +
 src/backends/cl/ClBackend.cpp            |   6 +-
 src/backends/cl/ClTensorHandle.cpp       |  82 ++++++++++++
 src/backends/cl/ClTensorHandle.hpp       | 184 +++++++++++++++++++++++++++++-
 src/backends/cl/backend.mk               |   1 +
 src/backends/cl/test/ClEndToEndTests.cpp |   7 +-
 6 files changed, 275 insertions(+), 6 deletions(-)
 create mode 100644 src/backends/cl/ClTensorHandle.cpp

(limited to 'src/backends/cl')

diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt
index 20c42061fc..cc71069910 100644
--- a/src/backends/cl/CMakeLists.txt
+++ b/src/backends/cl/CMakeLists.txt
@@ -39,6 +39,7 @@ if(ARMCOMPUTECL)
         ClLayerSupport.cpp
         ClLayerSupport.hpp
         ClRegistryInitializer.cpp
+        ClTensorHandle.cpp
         ClTensorHandle.hpp
         ClTensorHandleFactory.cpp
         ClTensorHandleFactory.hpp
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index a10b6fbb43..b018654288 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -455,6 +455,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                     replacementLayer->m_Gamma = std::move(baseLayer->m_Gamma);
                     replacementLayer->m_Mean = std::move(baseLayer->m_Mean);
                     replacementLayer->m_Variance = std::move(baseLayer->m_Variance);
+                    untouched.erase(baseLayer->GetGuid());
                     untouched.erase(activationLayer->GetGuid());
                 }
@@ -476,6 +477,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                                      activationLayer,
                                                                      activationDesc,
                                                                      name);
+                    untouched.erase(baseLayer->GetGuid());
                     untouched.erase(activationLayer->GetGuid());
                 }
@@ -623,6 +625,8 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                                           activationDesc,
                                                                           BinaryOperation::Sub,
                                                                           name);
+                        untouched.erase(baseLayer->GetGuid());
+                        untouched.erase(activationLayer->GetGuid());
                     }
                 }
                 // No fusion available for other BinaryOperations
@@ -678,7 +682,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
         }
     }

-    if (optimizationViews.GetSubstitutions().empty())
+    if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty())
     {
         optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
     }
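Note on the mechanism: the ClBackend changes above stop fused-away base layers
from being re-added as an untouched subgraph, and the new GetDeletedSubgraphs()
check covers the case where an optimization only deletes layers (for example a
removed Reshape) without substituting anything. The new file below supplies the
tensor-handle decorator that makes Reshape removal possible. A minimal sketch
of the intended use, assuming DecorateTensorHandle() is declared on
ITensorHandle as the override in this patch suggests; the helper name
SkipReshape and its arguments are hypothetical, not part of the patch:

    // Rather than running a Reshape workload, hand the consumer a decorated
    // handle: the same device buffer, but a TensorInfo carrying the new shape.
    std::shared_ptr<armnn::ITensorHandle> SkipReshape(armnn::ITensorHandle* producerOutput,
                                                      const armnn::TensorInfo& reshapedInfo)
    {
        // No copy and no kernel launch; only the tensor metadata changes.
        return producerOutput->DecorateTensorHandle(reshapedInfo);
    }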
diff --git a/src/backends/cl/ClTensorHandle.cpp b/src/backends/cl/ClTensorHandle.cpp
new file mode 100644
index 0000000000..ccc8f6effc
--- /dev/null
+++ b/src/backends/cl/ClTensorHandle.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClTensorHandle.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+    std::shared_ptr<ITensorHandle> ClTensorHandle::DecorateTensorHandle(const TensorInfo& tensorInfo)
+    {
+        auto* parent = const_cast<ClTensorHandle*>(this);
+        auto decorated = std::make_shared<ClTensorHandleDecorator>(parent, tensorInfo);
+        m_Decorated.emplace_back(decorated);
+        return decorated;
+    }
+
+    ClTensorDecorator::ClTensorDecorator()
+        : m_Original(nullptr), m_TensorInfo()
+    {
+    }
+
+    ClTensorDecorator::ClTensorDecorator(arm_compute::ICLTensor* original, const TensorInfo& tensorInfo)
+        : m_Original(nullptr), m_TensorInfo()
+    {
+        m_TensorInfo = armcomputetensorutils::BuildArmComputeTensorInfo(tensorInfo);
+        m_Original = original;
+    }
+
+    arm_compute::ITensorInfo* ClTensorDecorator::info() const
+    {
+        return &m_TensorInfo;
+    }
+
+    arm_compute::ITensorInfo* ClTensorDecorator::info()
+    {
+        return &m_TensorInfo;
+    }
+
+    const cl::Buffer& ClTensorDecorator::cl_buffer() const
+    {
+        ARM_COMPUTE_ERROR_ON(m_Original == nullptr);
+        return m_Original->cl_buffer();
+    }
+
+    arm_compute::ICLTensor* ClTensorDecorator::parent()
+    {
+        return nullptr;
+    }
+
+    arm_compute::CLQuantization ClTensorDecorator::quantization() const
+    {
+        return m_Original->quantization();
+    }
+
+    void ClTensorDecorator::map(bool blocking)
+    {
+        arm_compute::ICLTensor::map(arm_compute::CLScheduler::get().queue(), blocking);
+    }
+
+    void ClTensorDecorator::unmap()
+    {
+        arm_compute::ICLTensor::unmap(arm_compute::CLScheduler::get().queue());
+    }
+
+    uint8_t* ClTensorDecorator::do_map(cl::CommandQueue& q, bool blocking)
+    {
+        if(m_Original->buffer() == nullptr)
+        {
+            m_Original->map(q, blocking);
+        }
+        return m_Original->buffer();
+    }
+
+    void ClTensorDecorator::do_unmap(cl::CommandQueue& q)
+    {
+        m_Original->unmap(q);
+    }
+
+}
\ No newline at end of file
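Note: in the implementation above, info() returns the decorator's own
arm_compute::TensorInfo (built via BuildArmComputeTensorInfo()), while
cl_buffer(), quantization() and do_map()/do_unmap() all delegate to the wrapped
tensor, so reads and writes alias the original device memory. A short sketch of
that relationship, assuming an already-allocated handle; the handle name, shape
and data type here are illustrative only:

    // 16 floats produced as shape {4, 4} can be consumed as {2, 8} with no copy.
    arm_compute::ICLTensor& original = clTensorHandle->GetTensor();   // hypothetical handle
    armnn::TensorInfo reshapedInfo({ 2, 8 }, armnn::DataType::Float32);
    armnn::ClTensorDecorator decorated(&original, reshapedInfo);
    // decorated.info() now describes {2, 8}, but decorated.cl_buffer() is
    // still original's buffer.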
diff --git a/src/backends/cl/ClTensorHandle.hpp b/src/backends/cl/ClTensorHandle.hpp
index 3d750f9059..42657341fd 100644
--- a/src/backends/cl/ClTensorHandle.hpp
+++ b/src/backends/cl/ClTensorHandle.hpp
@@ -1,7 +1,8 @@
 //
-// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+
 #pragma once

 #include <aclCommon/ArmComputeTensorHandle.hpp>
@@ -22,6 +23,7 @@

 namespace armnn
 {
+class ClTensorHandleDecorator;

 class ClTensorHandle : public IClTensorHandle
 {
@@ -122,7 +124,7 @@ public:
     virtual bool Import(void* memory, MemorySource source) override
     {
         armnn::IgnoreUnused(memory);
-        if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+        if (m_ImportFlags& static_cast<MemorySourceFlags>(source))
         {
             throw MemoryImportException("ClTensorHandle::Incorrect import flag");
         }
@@ -137,6 +139,8 @@
         return false;
     }

+    virtual std::shared_ptr<ITensorHandle> DecorateTensorHandle(const TensorInfo& tensorInfo) override;
+
 private:
     // Only used for testing
     void CopyOutTo(void* memory) const override
@@ -227,6 +231,7 @@
     MemorySourceFlags m_ImportFlags;
     bool m_Imported;
     bool m_IsImportEnabled;
+    std::vector<std::shared_ptr<ClTensorHandleDecorator>> m_Decorated;
 };

 class ClSubTensorHandle : public IClTensorHandle
@@ -361,4 +366,179 @@ private:
     ITensorHandle* parentHandle = nullptr;
 };

+/** ClTensorDecorator wraps an existing CL tensor allowing us to override the TensorInfo for it */
+class ClTensorDecorator : public arm_compute::ICLTensor
+{
+public:
+    ClTensorDecorator();
+
+    ClTensorDecorator(arm_compute::ICLTensor* original, const TensorInfo& info);
+
+    ~ClTensorDecorator() = default;
+
+    ClTensorDecorator(const ClTensorDecorator&) = delete;
+
+    ClTensorDecorator& operator=(const ClTensorDecorator&) = delete;
+
+    ClTensorDecorator(ClTensorDecorator&&) = default;
+
+    ClTensorDecorator& operator=(ClTensorDecorator&&) = default;
+
+    arm_compute::ICLTensor* parent();
+
+    void map(bool blocking = true);
+    using arm_compute::ICLTensor::map;
+
+    void unmap();
+    using arm_compute::ICLTensor::unmap;
+
+    virtual arm_compute::ITensorInfo* info() const override;
+    virtual arm_compute::ITensorInfo* info() override;
+    const cl::Buffer& cl_buffer() const override;
+    arm_compute::CLQuantization quantization() const override;
+
+protected:
+    // Inherited methods overridden:
+    uint8_t* do_map(cl::CommandQueue& q, bool blocking) override;
+    void do_unmap(cl::CommandQueue& q) override;
+
+private:
+    arm_compute::ICLTensor* m_Original;
+    mutable arm_compute::TensorInfo m_TensorInfo;
+};
+
+class ClTensorHandleDecorator : public IClTensorHandle
+{
+public:
+    ClTensorHandleDecorator(IClTensorHandle* parent, const TensorInfo& info)
+    : m_Tensor(&parent->GetTensor(), info)
+    {
+        m_OriginalHandle = parent;
+    }
+
+    arm_compute::ICLTensor& GetTensor() override { return m_Tensor; }
+    arm_compute::ICLTensor const& GetTensor() const override { return m_Tensor; }
+
+    virtual void Allocate() override {}
+    virtual void Manage() override {}
+
+    virtual const void* Map(bool blocking = true) const override
+    {
+        m_Tensor.map(blocking);
+        return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+    }
+
+    virtual void Unmap() const override
+    {
+        m_Tensor.unmap();
+    }
+
+    virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+    virtual arm_compute::DataType GetDataType() const override
+    {
+        return m_Tensor.info()->data_type();
+    }
+
+    virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+    TensorShape GetStrides() const override
+    {
+        return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+    }
+
+    TensorShape GetShape() const override
+    {
+        return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+    }
+
+private:
+    // Only used for testing
+    void CopyOutTo(void* memory) const override
+    {
+        const_cast<ClTensorHandleDecorator*>(this)->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<float*>(memory));
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<uint8_t*>(memory));
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<armnn::Half*>(memory));
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int8_t*>(memory));
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int16_t*>(memory));
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int32_t*>(memory));
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        const_cast<ClTensorHandleDecorator*>(this)->Unmap();
+    }
+
+    // Only used for testing
+    void CopyInFrom(const void* memory) override
+    {
+        this->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        this->Unmap();
+    }
+
+    mutable ClTensorDecorator m_Tensor;
+    IClTensorHandle* m_OriginalHandle = nullptr;
+};
+
 } // namespace armnn
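Note: the CopyOutTo()/CopyInFrom() switches above are test-only helpers that
mirror the equivalent switches in the existing ClTensorHandle class; normal use
goes through Map()/Unmap(), where do_map() maps the wrapped tensor's buffer on
the CLScheduler queue. A sketch of a test-style readback through a decorated
handle, assuming Float32 data; here decorated stands for a handle previously
obtained from DecorateTensorHandle():

    // Map() maps the original CL buffer, but the first-element offset comes
    // from the overridden TensorInfo.
    const void* mapped = decorated->Map(true);
    const float* elements = static_cast<const float*>(mapped);
    // ... read values laid out according to the post-reshape TensorInfo ...
    decorated->Unmap();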
-TEST_CASE("RefSquaredDifferenceEndToEndTestUint8") +TEST_CASE("ClSquaredDifferenceEndToEndTestUint8") { ElementwiseBinarySimpleEndToEnd(clDefaultBackends, BinaryOperation::SqDiff); } -- cgit v1.2.1