7 files changed, 288 insertions, 6 deletions
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index 8abb16ccca..0fc5da78d1 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -133,6 +133,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
     MemorySourceFlags inputFlags,
     MemorySourceFlags outputFlags) const
 {
+    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
+    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
     std::shared_ptr<ClMemoryManager> memoryManager;
     if (m_UsingCustomAllocator)
     {
@@ -193,6 +202,15 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis
                                               MemorySourceFlags inputFlags,
                                               MemorySourceFlags outputFlags)
 {
+    // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
+    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
     std::shared_ptr<ClMemoryManager> memoryManager;
     if (m_UsingCustomAllocator)
     {
diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp
index a236a70d7c..54710d8135 100644
--- a/src/backends/cl/ClImportTensorHandle.hpp
+++ b/src/backends/cl/ClImportTensorHandle.hpp
@@ -205,7 +205,11 @@ public:
                 // We do this to match the behaviour of the Import function later on.
                 auto cachelineAlignment =
                         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-                auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+                auto roundedSize = totalBytes;
+                if (totalBytes % cachelineAlignment != 0)
+                {
+                    roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+                }
 
                 cl_int error = CL_SUCCESS;
                 cl_mem buffer;
@@ -252,7 +256,11 @@ private:
         // This does not change the size of the buffer, only the size of the mapping the buffer is mapped to
         auto cachelineAlignment =
                 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-        auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+        auto roundedSize = totalBytes;
+        if (totalBytes % cachelineAlignment != 0)
+        {
+            roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+        }
 
         cl_int error = CL_SUCCESS;
         cl_mem buffer;
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index d8b2d4f786..4a28205ade 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -11,11 +11,14 @@
 #include <armnn/utility/PolymorphicDowncast.hpp>
 #include <armnn/backends/MemCopyWorkload.hpp>
 #include <armnnTestUtils/TensorCopyUtils.hpp>
+#include <TensorHelpers.hpp>
 #include <armnnTestUtils/WorkloadTestUtils.hpp>
 
 #include <aclCommon/test/CreateWorkloadClNeon.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 
+#include <cl/ClImportTensorHandle.hpp>
+#include <cl/ClImportTensorHandleFactory.hpp>
 #include <cl/ClTensorHandle.hpp>
 #include <cl/ClWorkloadFactory.hpp>
 #include <cl/workloads/ClWorkloads.hpp>
@@ -355,6 +358,64 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dFastMathEnabledWo
     ARMNN_ASSERT(conv2dWorkload->GetConvolutionMethod() == arm_compute::ConvolutionMethod::WINOGRAD);
 }
 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClReplaceInputOutputConvolution2dWorkload")
+{
+    // Create Convolution2dWorkload with ClTensorHandle input and output
+    // Then replace the input and output with ClImportTensorHandle
+    Graph graph;
+    ClWorkloadFactory factory =
+        ClWorkloadFactoryHelper::GetFactory(ClWorkloadFactoryHelper::GetMemoryManager());
+
+    auto workload =
+        CreateConvolution2dWorkloadTest<ClConvolution2dWorkload, DataType::Float32>(factory,
+                                                                                    graph,
+                                                                                    DataLayout::NHWC);
+
+    TensorShape inputShape  = std::initializer_list<unsigned int>({2, 8, 16, 3});
+    TensorShape outputShape = std::initializer_list<unsigned int>({2, 2, 10, 2});
+
+    // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
+    Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle  = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto outputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+    CHECK((inputHandle->GetShape() == inputShape));
+    CHECK((outputHandle->GetShape() == outputShape));
+    // The input and output handles are created correctly as ClTensorHandle
+    CHECK((dynamic_cast<ClTensorHandle*>(inputHandle) != nullptr));
+    CHECK((dynamic_cast<ClTensorHandle*>(outputHandle) != nullptr));
+
+    // Replace with ImportTensorHandle
+    ClImportTensorHandleFactory importFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
+                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));
+
+    TensorInfo inputInfo({ 2, 8, 16, 3 }, DataType::Float32);
+    TensorInfo outputInfo({ 2, 2, 10, 2 }, DataType::Float32);
+
+    // create TensorHandle for memory import
+    auto inputImportHandle = importFactory.CreateTensorHandle(inputInfo);
+    auto outputImportHandle = importFactory.CreateTensorHandle(outputInfo);
+
+    // Calling ReplaceInputTensorHandle and ReplaceOutputTensorHandle does not throw exception
+    // as Reconfigure function is implemented
+    workload->ReplaceInputTensorHandle(inputImportHandle.get(), 0);
+    workload->ReplaceOutputTensorHandle(outputImportHandle.get(), 0);
+
+    // Correctly replaced with the import handles with correct information
+    queueDescriptor = workload->GetData();
+    auto replacedInputHandle  = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto replacedOutputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+    CHECK((replacedInputHandle->GetShape() == inputShape));
+    CHECK((replacedOutputHandle->GetShape() == outputShape));
+
+    // Both slots must now hold exactly the import handles that were passed in
+    CHECK((inputImportHandle.get() == replacedInputHandle));
+    CHECK((outputImportHandle.get() == replacedOutputHandle));
+
+    CHECK((dynamic_cast<ClTensorHandle*>(replacedInputHandle) == nullptr));
+    CHECK((dynamic_cast<ClImportTensorHandle*>(replacedInputHandle) != nullptr));
+    CHECK((dynamic_cast<ClTensorHandle*>(replacedOutputHandle) == nullptr));
+    CHECK((dynamic_cast<ClImportTensorHandle*>(replacedOutputHandle) != nullptr));
+}
+
 TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dClCompiledContextWorkload")
 {
     using namespace armnn;
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 3d702642aa..161765484d 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -274,4 +274,157 @@ TEST_CASE("ClCanBeImportedAlignedMemory")
     // we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
 }
 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
+{
+    // Create runtime in which test will run
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+    // build up the structure of the network
+    INetworkPtr network(INetwork::Create());
+
+    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
+    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+
+    kernelInfo.SetConstant(true);
+
+    std::vector<float> kernel =
+    {
+        4, 5, 6,
+        0, 0, 0,
+        3, 2, 1
+    };
+
+    const std::vector<float> expectedOutput =
+    {
+        23, 41, 33, 21,
+        44, 65, 76, 52,
+        82, 85, 79, 42
+    };
+
+    unsigned int numElements = inputInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
+    ARMNN_ASSERT(inputLayer);
+
+    armnn::ConstTensor weights(kernelInfo, kernel);
+
+    armnn::Convolution2dDescriptor convDesc2d;
+    convDesc2d.m_StrideX = 1;
+    convDesc2d.m_StrideY = 1;
+    convDesc2d.m_PadLeft = 1;
+    convDesc2d.m_PadRight = 1;
+    convDesc2d.m_PadTop = 1;
+    convDesc2d.m_PadBottom = 1;
+    convDesc2d.m_DataLayout = DataLayout::NHWC;
+    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
+                                                                          weights,
+                                                                          armnn::EmptyOptional(),
+                                                                          "conv");
+    ARMNN_ASSERT(convLayer);
+
+    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+    IConnectableLayer* output = network->AddOutputLayer(0, "output");
+    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+    // Optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = false;
+    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
+    CHECK(optNet);
+
+    // Loads it into the runtime.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    // Load with Undefined input/output memory sources; memory is imported explicitly below via ImportInputs/Outputs
+    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    const size_t alignment =
+        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+    size_t space = totalBytes + alignment + alignment;
+    auto inputData = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr = inputData.get();
+    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
+
+    // Fill the cacheline-aligned input buffer with the test values
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    inputPtr[0] = 1;
+    inputPtr[1] = 5;
+    inputPtr[2] = 2;
+    inputPtr[3] = 3;
+    inputPtr[4] = 8;
+    inputPtr[5] = 7;
+    inputPtr[6] = 3;
+    inputPtr[7] = 6;
+    inputPtr[8] = 3;
+    inputPtr[9] = 3;
+    inputPtr[10] = 9;
+    inputPtr[11] = 1;
+
+
+    auto outputData = std::make_unique<uint8_t[]>(space);
+    void* alignedOutputPtr = outputData.get();
+    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
+    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+    std::fill_n(outputPtr, numElements, -10.0f);
+
+    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+    inputTensorInfo.SetConstant(true);
+    InputTensors inputTensors
+    {
+        {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+    };
+    OutputTensors outputTensors
+    {
+        {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    INFO("Run ImportInputs");
+    std::vector<ImportedInputId> importedInputIds =
+        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+    std::vector<ImportedOutputId> importedOutputIds =
+        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains Convolution2dWorkload
+    std::size_t found = dump.find("Convolution2dWorkload");
+    CHECK(found != std::string::npos);
+
+    // Contains SyncMemGeneric
+    found = dump.find("SyncMemGeneric");
+    CHECK(found != std::string::npos);
+
+    // Does not contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    CHECK(found == std::string::npos);
+
+    runtime->UnloadNetwork(netId);
+
+    // Check output is as expected
+    // The result is read from the test-owned buffer (outputData), which outlives the unloaded network
+    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+    CHECK(outputResult);
+
+    // Check the output is correct
+    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
+}
+
 }
diff --git a/src/backends/cl/workloads/ClBaseWorkload.hpp b/src/backends/cl/workloads/ClBaseWorkload.hpp
index e74fc84f4f..03417e33ae 100644
--- a/src/backends/cl/workloads/ClBaseWorkload.hpp
+++ b/src/backends/cl/workloads/ClBaseWorkload.hpp
@@ -20,17 +20,38 @@ public:
     // Replace input tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
         this->m_Data.m_Inputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(const armnn::UnimplementedException&)
+        {
+            // Cannot reconfigure: restore the previous input handle and rethrow the original exception.
+            this->m_Data.m_Inputs[slot] = backupHandle;
+            throw;
+        }
     }
 
     // Replace output tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
         this->m_Data.m_Outputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(const armnn::UnimplementedException&)
+        {
+            // Cannot reconfigure: restore the previous output handle and rethrow the original exception.
+            this->m_Data.m_Outputs[slot] = backupHandle;
+            throw;
+        }
     }
 
+protected:
     // Reconfigure the workload configuration. Throw armnn::UnimplementedException by default.
     virtual void Reconfigure()
     {
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index 705e92d307..cdfa885f67 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -90,6 +90,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
+    // Create Proxy tensor and set the initial tensor handle to it
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
@@ -101,10 +105,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_configure");
         m_ConvolutionLayer.configure(clCompileContext,
-                                     &input,
+                                     m_InputProxy.get(),
                                      m_KernelTensor.get(),
                                      m_BiasTensor.get(),
-                                     &output,
+                                     m_OutputProxy.get(),
                                      padStrideInfo,
                                      arm_compute::WeightsInfo(),
                                      aclDilationInfo,
@@ -174,4 +178,13 @@ void ClConvolution2dWorkload::FreeUnusedTensors()
     FreeTensorIfUnused(m_BiasTensor);
 }
 
+void ClConvolution2dWorkload::Reconfigure()
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure");
+    // Re-point each proxy at the tensor of the handle currently stored in m_Data, so that the
+    // proxies handed to m_ConvolutionLayer.configure() refer to the replaced handles.
+    m_InputProxy->set(&static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor());
+    m_OutputProxy->set(&static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor());
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 8a4599df47..891d5096cd 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -13,6 +13,8 @@
 #include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h>
 #include <arm_compute/runtime/MemoryManagerOnDemand.h>
 
+#include <cl/ICLTensorProxy.hpp>
+
 #include <memory>
 
 namespace armnn
@@ -38,6 +40,9 @@ public:
 
     arm_compute::ConvolutionMethod GetConvolutionMethod() const;
 
+protected:
+    void Reconfigure() override;
+
 private:
     mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
 
@@ -47,6 +52,9 @@ private:
     arm_compute::ConvolutionMethod m_ConvolutionMethod;
 
     void FreeUnusedTensors();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 } //namespace armnn