Diffstat (limited to 'src/backends/cl')
 src/backends/cl/CMakeLists.txt            |   1 +
 src/backends/cl/ClBackend.cpp             |   6 +-
 src/backends/cl/ClTensorHandle.cpp        |  82 ++++++++++
 src/backends/cl/ClTensorHandle.hpp        | 184 +++++++++++++++++++++-
 src/backends/cl/backend.mk                |   1 +
 src/backends/cl/test/ClEndToEndTests.cpp  |   7 +-
 6 files changed, 275 insertions(+), 6 deletions(-)
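
In summary: this patch adds tensor handle decoration to the CL backend. A new ClTensorHandleDecorator (backed by a ClTensorDecorator that wraps the underlying arm_compute::ICLTensor) lets an existing ClTensorHandle be viewed under a different TensorInfo without copying the OpenCL buffer. It also completes the fusion bookkeeping in ClBackend::OptimizeSubgraphView for the Sub case, stops subgraphs that only had layers deleted from being re-added as untouched, and renames three CL end-to-end tests that carried copy-pasted "Ref" prefixes.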
diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt
index 20c42061fc..cc71069910 100644
--- a/src/backends/cl/CMakeLists.txt
+++ b/src/backends/cl/CMakeLists.txt
@@ -39,6 +39,7 @@ if(ARMCOMPUTECL)
ClLayerSupport.cpp
ClLayerSupport.hpp
ClRegistryInitializer.cpp
+ ClTensorHandle.cpp
ClTensorHandle.hpp
ClTensorHandleFactory.cpp
ClTensorHandleFactory.hpp
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index a10b6fbb43..b018654288 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -455,6 +455,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
replacementLayer->m_Gamma = std::move(baseLayer->m_Gamma);
replacementLayer->m_Mean = std::move(baseLayer->m_Mean);
replacementLayer->m_Variance = std::move(baseLayer->m_Variance);
+
untouched.erase(baseLayer->GetGuid());
untouched.erase(activationLayer->GetGuid());
}
@@ -476,6 +477,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
activationLayer,
activationDesc,
name);
+
untouched.erase(baseLayer->GetGuid());
untouched.erase(activationLayer->GetGuid());
}
@@ -623,6 +625,8 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
activationDesc,
BinaryOperation::Sub,
name);
+ untouched.erase(baseLayer->GetGuid());
+ untouched.erase(activationLayer->GetGuid());
}
}
// No fusion available for other BinaryOperations
@@ -678,7 +682,7 @@ OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
}
}
- if (optimizationViews.GetSubstitutions().empty())
+ if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty())
{
optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
}
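
The three ClBackend.cpp hunks above enforce one bookkeeping rule: every layer consumed by a fusion must leave the `untouched` set, and a subgraph may only be re-added untouched when the optimization produced neither substitutions nor deletions. A minimal, self-contained sketch of that rule follows; all names and values are illustrative, not ArmNN API:

```cpp
#include <cstdio>
#include <unordered_set>

int main()
{
    std::unordered_set<int> untouched = { 1, 2, 3 };  // layer GUIDs in the subgraph
    int substitutions = 0;
    int deletions = 0;

    // Suppose layers 1 (base) and 2 (activation) are fused away, as in the Sub case above.
    bool fused = true;
    if (fused)
    {
        // Both fused layers must leave 'untouched', otherwise they would be
        // re-emitted as an untouched subgraph alongside their replacement.
        untouched.erase(1);
        untouched.erase(2);
        ++deletions;
    }

    // Checking substitutions alone would wrongly re-add the subgraph here;
    // the patch adds the deletions check for exactly this situation.
    if (substitutions == 0 && deletions == 0)
    {
        std::puts("subgraph re-added untouched");
    }
    else
    {
        std::printf("%zu layer(s) remain untouched\n", untouched.size());
    }
    return 0;
}
```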
diff --git a/src/backends/cl/ClTensorHandle.cpp b/src/backends/cl/ClTensorHandle.cpp
new file mode 100644
index 0000000000..ccc8f6effc
--- /dev/null
+++ b/src/backends/cl/ClTensorHandle.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClTensorHandle.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+ std::shared_ptr<ITensorHandle> ClTensorHandle::DecorateTensorHandle(const TensorInfo& tensorInfo)
+ {
+ auto* parent = const_cast<ClTensorHandle*>(this);
+ auto decorated = std::make_shared<ClTensorHandleDecorator>(parent, tensorInfo);
+ m_Decorated.emplace_back(decorated);
+ return decorated;
+ }
+
+ ClTensorDecorator::ClTensorDecorator()
+ : m_Original(nullptr), m_TensorInfo()
+ {
+ }
+
+ ClTensorDecorator::ClTensorDecorator(arm_compute::ICLTensor* original, const TensorInfo& tensorInfo)
+ : m_Original(nullptr), m_TensorInfo()
+ {
+ m_TensorInfo = armcomputetensorutils::BuildArmComputeTensorInfo(tensorInfo);
+ m_Original = original;
+ }
+
+ arm_compute::ITensorInfo* ClTensorDecorator::info() const
+ {
+ return &m_TensorInfo;
+ }
+
+ arm_compute::ITensorInfo* ClTensorDecorator::info()
+ {
+ return &m_TensorInfo;
+ }
+
+ const cl::Buffer& ClTensorDecorator::cl_buffer() const
+ {
+ ARM_COMPUTE_ERROR_ON(m_Original == nullptr);
+ return m_Original->cl_buffer();
+ }
+
+ arm_compute::ICLTensor* ClTensorDecorator::parent()
+ {
+ return nullptr;
+ }
+
+ arm_compute::CLQuantization ClTensorDecorator::quantization() const
+ {
+ return m_Original->quantization();
+ }
+
+ void ClTensorDecorator::map(bool blocking)
+ {
+ arm_compute::ICLTensor::map(arm_compute::CLScheduler::get().queue(), blocking);
+ }
+
+ void ClTensorDecorator::unmap()
+ {
+ arm_compute::ICLTensor::unmap(arm_compute::CLScheduler::get().queue());
+ }
+
+ uint8_t* ClTensorDecorator::do_map(cl::CommandQueue& q, bool blocking)
+ {
+ if(m_Original->buffer() == nullptr)
+ {
+ m_Original->map(q, blocking);
+ }
+ return m_Original->buffer();
+ }
+
+ void ClTensorDecorator::do_unmap(cl::CommandQueue& q)
+ {
+ m_Original->unmap(q);
+ }
+
+}
\ No newline at end of file
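
A hedged sketch of how the new entry point is meant to be used: DecorateTensorHandle returns a handle that shares the parent's OpenCL buffer but reports the supplied TensorInfo, and the parent keeps the decorator alive via m_Decorated. The function, handle variable, and shapes below are assumptions for illustration, not code from this patch:

```cpp
#include <cl/ClTensorHandle.hpp>

#include <memory>

// Sketch: view an existing, already-allocated 1x2x2x4 float tensor as 1x4x2x2
// without copying. 'inputHandle' is an assumed ClTensorHandle*.
void ViewAsDifferentShape(armnn::ClTensorHandle* inputHandle)
{
    armnn::TensorInfo reinterpreted({ 1, 4, 2, 2 }, armnn::DataType::Float32);
    std::shared_ptr<armnn::ITensorHandle> decorated =
        inputHandle->DecorateTensorHandle(reinterpreted);

    // Map/Unmap go through ClTensorDecorator::do_map, which maps the original
    // tensor, so reads see the same buffer under the new TensorInfo.
    const void* data = decorated->Map(true);
    // ... consume 'data' as a 1x4x2x2 float tensor ...
    decorated->Unmap();
}
```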
diff --git a/src/backends/cl/ClTensorHandle.hpp b/src/backends/cl/ClTensorHandle.hpp
index 3d750f9059..42657341fd 100644
--- a/src/backends/cl/ClTensorHandle.hpp
+++ b/src/backends/cl/ClTensorHandle.hpp
@@ -1,7 +1,8 @@
//
-// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
+
#pragma once
#include <aclCommon/ArmComputeTensorHandle.hpp>
@@ -22,6 +23,7 @@
namespace armnn
{
+class ClTensorHandleDecorator;
class ClTensorHandle : public IClTensorHandle
{
@@ -122,7 +124,7 @@ public:
virtual bool Import(void* memory, MemorySource source) override
{
armnn::IgnoreUnused(memory);
- if (m_ImportFlags& static_cast<MemorySourceFlags>(source))
+ if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
{
throw MemoryImportException("ClTensorHandle::Incorrect import flag");
}
@@ -137,6 +139,8 @@ public:
return false;
}
+ virtual std::shared_ptr<ITensorHandle> DecorateTensorHandle(const TensorInfo& tensorInfo) override;
+
private:
// Only used for testing
void CopyOutTo(void* memory) const override
@@ -227,6 +231,7 @@ private:
MemorySourceFlags m_ImportFlags;
bool m_Imported;
bool m_IsImportEnabled;
+ std::vector<std::shared_ptr<ClTensorHandleDecorator>> m_Decorated;
};
class ClSubTensorHandle : public IClTensorHandle
@@ -361,4 +366,179 @@ private:
ITensorHandle* parentHandle = nullptr;
};
+/** ClTensorDecorator wraps an existing CL tensor allowing us to override the TensorInfo for it */
+class ClTensorDecorator : public arm_compute::ICLTensor
+{
+public:
+ ClTensorDecorator();
+
+ ClTensorDecorator(arm_compute::ICLTensor* original, const TensorInfo& info);
+
+ ~ClTensorDecorator() = default;
+
+ ClTensorDecorator(const ClTensorDecorator&) = delete;
+
+ ClTensorDecorator& operator=(const ClTensorDecorator&) = delete;
+
+ ClTensorDecorator(ClTensorDecorator&&) = default;
+
+ ClTensorDecorator& operator=(ClTensorDecorator&&) = default;
+
+ arm_compute::ICLTensor* parent();
+
+ void map(bool blocking = true);
+ using arm_compute::ICLTensor::map;
+
+ void unmap();
+ using arm_compute::ICLTensor::unmap;
+
+ virtual arm_compute::ITensorInfo* info() const override;
+ virtual arm_compute::ITensorInfo* info() override;
+ const cl::Buffer& cl_buffer() const override;
+ arm_compute::CLQuantization quantization() const override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t* do_map(cl::CommandQueue& q, bool blocking) override;
+ void do_unmap(cl::CommandQueue& q) override;
+
+private:
+ arm_compute::ICLTensor* m_Original;
+ mutable arm_compute::TensorInfo m_TensorInfo;
+};
+
+class ClTensorHandleDecorator : public IClTensorHandle
+{
+public:
+ ClTensorHandleDecorator(IClTensorHandle* parent, const TensorInfo& info)
+ : m_Tensor(&parent->GetTensor(), info)
+ {
+ m_OriginalHandle = parent;
+ }
+
+ arm_compute::ICLTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::ICLTensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ m_Tensor.map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+
+ virtual void Unmap() const override
+ {
+ m_Tensor.unmap();
+ }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+private:
+ // Only used for testing
+ void CopyOutTo(void* memory) const override
+ {
+ const_cast<ClTensorHandleDecorator*>(this)->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<float*>(memory));
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<uint8_t*>(memory));
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::Half*>(memory));
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int8_t*>(memory));
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int16_t*>(memory));
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int32_t*>(memory));
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ const_cast<ClTensorHandleDecorator*>(this)->Unmap();
+ }
+
+ // Only used for testing
+ void CopyInFrom(const void* memory) override
+ {
+ this->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+ this->GetTensor());
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ this->Unmap();
+ }
+
+ mutable ClTensorDecorator m_Tensor;
+ IClTensorHandle* m_OriginalHandle = nullptr;
+};
+
} // namespace armnn
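
The header-side design is a classic decorator: ClTensorDecorator holds its own arm_compute::TensorInfo (mutable, since info() const must hand out a non-const pointer) while forwarding all storage access, cl_buffer, do_map, and do_unmap, to the wrapped ICLTensor. A self-contained analogue of that metadata/storage split, in plain C++ with no ArmNN or ACL types:

```cpp
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// The decorated object: owns the actual storage.
struct Storage
{
    std::vector<float> data;
};

// The decorator: owns new metadata (here just a shape) but delegates storage,
// playing the role ClTensorDecorator plays for arm_compute::ICLTensor.
class View
{
public:
    View(Storage* original, std::vector<std::size_t> shape)
        : m_Original(original), m_Shape(std::move(shape)) {}

    const std::vector<std::size_t>& shape() const { return m_Shape; }  // own metadata
    float* buffer() { return m_Original->data.data(); }                // delegated storage

private:
    Storage* m_Original;
    std::vector<std::size_t> m_Shape;
};

int main()
{
    Storage s{ std::vector<float>(16, 1.0f) };
    View asMatrix(&s, { 4, 4 });   // the 16 floats viewed as 4x4
    View asVector(&s, { 16 });     // ... and as a flat vector

    asMatrix.buffer()[0] = 42.0f;
    std::printf("%f\n", asVector.buffer()[0]); // prints 42.000000: one buffer, two infos
    return 0;
}
```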
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 03f1a9540d..f4b9fac740 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -23,6 +23,7 @@ BACKEND_SOURCES := \
ClImportTensorHandleFactory.cpp \
ClLayerSupport.cpp \
ClRegistryInitializer.cpp \
+ ClTensorHandle.cpp \
ClTensorHandleFactory.cpp \
ClWorkloadFactory.cpp \
OpenClTimer.cpp \
diff --git a/src/backends/cl/test/ClEndToEndTests.cpp b/src/backends/cl/test/ClEndToEndTests.cpp
index 091526fd2b..6342cbc48a 100644
--- a/src/backends/cl/test/ClEndToEndTests.cpp
+++ b/src/backends/cl/test/ClEndToEndTests.cpp
@@ -24,6 +24,7 @@
#include <backendsCommon/test/ReshapeEndToEndTestImpl.hpp>
#include <backendsCommon/test/SpaceToDepthEndToEndTestImpl.hpp>
#include <backendsCommon/test/SplitterEndToEndTestImpl.hpp>
+#include <backendsCommon/test/SubgraphUtilsTest.hpp>
#include <backendsCommon/test/TransposeConvolution2dEndToEndTestImpl.hpp>
#include <backendsCommon/test/TransposeEndToEndTestImpl.hpp>
@@ -60,18 +61,18 @@ TEST_CASE("ClAdditionEndToEndUint8Test")
}
// Power
-TEST_CASE("RefPowerEndToEndTestFloat32")
+TEST_CASE("ClPowerEndToEndTestFloat32")
{
ElementwiseBinarySimpleEndToEnd<armnn::DataType::Float32>(clDefaultBackends, BinaryOperation::Power);
}
// SqDiff
-TEST_CASE("RefSquaredDifferenceEndToEndTestFloat32")
+TEST_CASE("ClSquaredDifferenceEndToEndTestFloat32")
{
ElementwiseBinarySimpleEndToEnd<armnn::DataType::Float32>(clDefaultBackends, BinaryOperation::SqDiff);
}
-TEST_CASE("RefSquaredDifferenceEndToEndTestUint8")
+TEST_CASE("ClSquaredDifferenceEndToEndTestUint8")
{
ElementwiseBinarySimpleEndToEnd<armnn::DataType::QAsymmU8>(clDefaultBackends, BinaryOperation::SqDiff);
}