From 60ab9765329b1449b509b32b07f0b0abb3b532f2 Mon Sep 17 00:00:00 2001
From: Nikhil Raj
Date: Thu, 13 Jan 2022 09:34:44 +0000
Subject: IVGCVSW-6673 Implement CanBeImported function to ClTensorHandle

 * Added Unittests

Signed-off-by: Nikhil Raj
Signed-off-by: David Monahan
Change-Id: If7c0add39583a7e47b43fd79f93c620f86f80fc1
---
 src/backends/cl/ClImportTensorHandle.hpp           | 61 +++++++++++++++++++++-
 src/backends/cl/ClTensorHandle.hpp                 |  7 +++
 src/backends/cl/test/ClImportTensorHandleTests.cpp | 57 ++++++++++++++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp
index a24ab5656e..18cd1ffeb4 100644
--- a/src/backends/cl/ClImportTensorHandle.hpp
+++ b/src/backends/cl/ClImportTensorHandle.hpp
@@ -192,12 +192,71 @@ public:
         }
     }
 
+    virtual bool CanBeImported(void* memory, MemorySource source) override
+    {
+        if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+        {
+            if (source == MemorySource::Malloc)
+            {
+                const cl_import_properties_arm importProperties[] =
+                {
+                    CL_IMPORT_TYPE_ARM,
+                    CL_IMPORT_TYPE_HOST_ARM,
+                    0
+                };
+
+                size_t totalBytes = m_Tensor.info()->total_size();
+
+                // Round the size of the mapping to match the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE.
+                // This does not change the size of the buffer, only the size of the mapping the buffer is mapped to.
+                // We do this to match the behaviour of the Import function later on.
+                auto cachelineAlignment =
+                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+                auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+
+                cl_int error = CL_SUCCESS;
+                cl_mem buffer;
+                buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+                                           CL_MEM_READ_WRITE, importProperties, memory, roundedSize, &error);
+
+                // If we fail to map, we know the import will not succeed and can return false.
+                // There is no memory to be released if error is not CL_SUCCESS.
+                if (error != CL_SUCCESS)
+                {
+                    return false;
+                }
+                else
+                {
+                    // If the import was successful, we can release the mapping knowing the import will succeed
+                    // at workload execution, and return true.
+                    error = clReleaseMemObject(buffer);
+                    if (error == CL_SUCCESS)
+                    {
+                        return true;
+                    }
+                    else
+                    {
+                        // If we couldn't release the mapping, this constitutes a memory leak, so throw an exception.
+                        throw MemoryImportException("ClImportTensorHandle::Failed to unmap cl_mem buffer: "
+                                                    + std::to_string(error));
+                    }
+                }
+            }
+        }
+        else
+        {
+            throw MemoryImportException("ClImportTensorHandle::Incorrect import flag");
+        }
+        return false;
+    }
+
 private:
     bool ClImport(const cl_import_properties_arm* importProperties, void* memory, bool isProtected = false)
     {
         size_t totalBytes = m_Tensor.info()->total_size();
 
-        // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+        // Round the size of the mapping to match the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE.
+        // This does not change the size of the buffer, only the size of the mapping the buffer is mapped to.
         auto cachelineAlignment =
             arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
         auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);

diff --git a/src/backends/cl/ClTensorHandle.hpp b/src/backends/cl/ClTensorHandle.hpp
index 5720d2cf11..6fccb8d395 100644
--- a/src/backends/cl/ClTensorHandle.hpp
+++ b/src/backends/cl/ClTensorHandle.hpp
@@ -138,6 +138,13 @@ public:
         return false;
     }
 
+    virtual bool CanBeImported(void* memory, MemorySource source) override
+    {
+        // This TensorHandle can never import.
+        armnn::IgnoreUnused(memory, source);
+        return false;
+    }
+
 private:
     // Only used for testing
     void CopyOutTo(void* memory) const override

diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 0403d5379e..3d702642aa 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -217,4 +217,61 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
     }
 }
 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
+{
+    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
+                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));
+
+    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
+
+    // Create a TensorHandle for memory import.
+    auto handle = handleFactory.CreateTensorHandle(info);
+
+    // Get the CLTensor.
+    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
+
+    // Allocate user memory.
+    const size_t totalBytes = tensor.info()->total_size();
+    const size_t alignment =
+        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+    size_t space = totalBytes + alignment + alignment;
+    auto testData = std::make_unique<uint8_t[]>(space);
+    void* alignedPtr = testData.get();
+    CHECK(std::align(alignment, totalBytes, alignedPtr, space));
+
+    // Check that probing with an unsupported memory source throws.
+    CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
+}
+
+TEST_CASE("ClCanBeImportedAlignedMemory")
+{
+    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
+                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));
+
+    TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);
+
+    // Create a TensorHandle (Memory Managed status is irrelevant).
+    auto handle = handleFactory.CreateTensorHandle(info);
+    // Get the CLTensor.
+    arm_compute::CLTensor& tensor =
+        PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
+
+    // Create an aligned buffer.
+    const size_t totalBytes = tensor.info()->total_size();
+    const size_t alignment =
+        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+    size_t space = totalBytes + alignment + alignment;
+    auto testData = std::make_unique<uint8_t[]>(space);
+    void* alignedPtr = testData.get();
+    CHECK(std::align(alignment, totalBytes, alignedPtr, space));
+
+    // Check that aligned buffers return true.
+    CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);
+
+    // Due to the nature of how GPU memory is mapped, it is entirely possible for memory which is misaligned on
+    // the CPU to be successfully imported on the GPU. As such, there is no way to create a misaligned pointer
+    // that will always fail: it will succeed on some devices and fail on others. As long as a correctly aligned
+    // buffer returns true, we can be confident that it will be successfully imported. All other cases will need
+    // to be handled by the user.
+}
+
 }
-- 
cgit v1.2.1
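
A minimal sketch of how a caller might use the probe added by this patch, assuming
ArmNN's public ITensorHandle interface (CanBeImported, Import, Map, Unmap). The
header paths, the ImportOrCopy name, and the fall-back-to-copy policy are
illustrative assumptions, not part of this patch:

    #include <armnn/backends/ITensorHandle.hpp>
    #include <armnn/MemorySources.hpp>
    #include <cstddef>
    #include <cstring>

    // Probe a user buffer with CanBeImported() before committing to a
    // zero-copy import; fall back to copying into the handle's own
    // allocation when the buffer cannot be imported.
    bool ImportOrCopy(armnn::ITensorHandle& handle, void* userBuffer, std::size_t numBytes)
    {
        using armnn::MemorySource;
        if (handle.CanBeImported(userBuffer, MemorySource::Malloc))
        {
            // The probe mapping succeeded, so Import() is expected to
            // succeed at workload execution time.
            return handle.Import(userBuffer, MemorySource::Malloc);
        }
        // Copy path: map the handle's own allocation and copy the data in.
        if (void* dst = handle.Map(/*blocking=*/true))
        {
            std::memcpy(dst, userBuffer, numBytes);
            handle.Unmap();
        }
        return false;
    }

Note that CanBeImported() performs a trial clImportMemoryARM() mapping and releases
it immediately, so a true result means the later Import() is expected to succeed,
at the cost of one extra map/unmap per probe.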