Diffstat (limited to 'src/backends/cl')
-rw-r--r--  src/backends/cl/CMakeLists.txt                    5
-rw-r--r--  src/backends/cl/OpenClTimer.cpp                 105
-rw-r--r--  src/backends/cl/OpenClTimer.hpp                  59
-rw-r--r--  src/backends/cl/backend.mk                        1
-rw-r--r--  src/backends/cl/test/CMakeLists.txt               2
-rw-r--r--  src/backends/cl/test/ClCreateWorkloadTests.cpp    6
-rw-r--r--  src/backends/cl/test/ClMemCopyTests.cpp          39
-rw-r--r--  src/backends/cl/test/Fp16SupportTest.cpp        112
-rw-r--r--  src/backends/cl/test/OpenClTimerTest.cpp        143
-rw-r--r--  src/backends/cl/workloads/ClWorkloadUtils.hpp     4
10 files changed, 472 insertions, 4 deletions
diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt
index 04da6ddcff..2f32081dfe 100644
--- a/src/backends/cl/CMakeLists.txt
+++ b/src/backends/cl/CMakeLists.txt
@@ -15,6 +15,11 @@ list(APPEND armnnClBackend_sources
)
if(ARMCOMPUTECL)
+ list(APPEND armnnClBackend_sources
+ OpenClTimer.cpp
+ OpenClTimer.hpp
+ )
+
add_subdirectory(workloads)
add_subdirectory(test)
endif()
diff --git a/src/backends/cl/OpenClTimer.cpp b/src/backends/cl/OpenClTimer.cpp
new file mode 100644
index 0000000000..57552d7bd9
--- /dev/null
+++ b/src/backends/cl/OpenClTimer.cpp
@@ -0,0 +1,105 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "OpenClTimer.hpp"
+
+#include <string>
+#include <sstream>
+
+namespace armnn
+{
+
+OpenClTimer::OpenClTimer()
+{
+}
+
+void OpenClTimer::Start()
+{
+ m_Kernels.clear();
+
+ auto interceptor = [this]( cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *gwo,
+ const size_t *gws,
+ const size_t *lws,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+ {
+ cl_int retVal = 0;
+
+ // Get the name of the kernel
+ cl::Kernel retainedKernel(kernel, true);
+ std::stringstream ss;
+ ss << retainedKernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
+
+ // Embed workgroup sizes into the name
+ if(gws != nullptr)
+ {
+ ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
+ }
+ if(lws != nullptr)
+ {
+ ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
+ }
+
+ cl_event customEvent;
+
+ // Forward to original OpenCl function
+ retVal = m_OriginalEnqueueFunction( command_queue,
+ kernel,
+ work_dim,
+ gwo,
+ gws,
+ lws,
+ num_events_in_wait_list,
+ event_wait_list,
+ &customEvent);
+
+ // Store the Kernel info for later GetMeasurements() call
+ m_Kernels.emplace_back(ss.str(), customEvent);
+
+ return retVal;
+ };
+
+ m_OriginalEnqueueFunction = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
+}
+
+void OpenClTimer::Stop()
+{
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = m_OriginalEnqueueFunction;
+}
+
+std::vector<Measurement> OpenClTimer::GetMeasurements() const
+{
+ std::vector<Measurement> measurements;
+
+ cl_command_queue_properties clQueueProperties = CLScheduler::get().queue().getInfo<CL_QUEUE_PROPERTIES>();
+
+ int idx = 0;
+ for (auto& kernel : m_Kernels)
+ {
+ std::string name = std::string(this->GetName()) + "/" + std::to_string(idx++) + ": " + kernel.m_Name;
+
+ double timeUs = 0.0;
+ if((clQueueProperties & CL_QUEUE_PROFILING_ENABLE) != 0)
+ {
+ // Wait for the event to finish before accessing profile results.
+ kernel.m_Event.wait();
+
+ cl_ulong start = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ cl_ulong end = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ timeUs = static_cast<double>(end - start) / 1000.0;
+ }
+
+ measurements.emplace_back(name, timeUs, Measurement::Unit::TIME_US);
+ }
+
+ return measurements;
+}
+
+} //namespace armnn
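A minimal usage sketch of the timer implemented above (not part of this change). It assumes OpenCL
profiling has been enabled, e.g. through ClContextControl as the new OpenClTimerTest.cpp does; the
helper name and the IWorkload header path are illustrative assumptions, not taken from this diff.

    #include <backends/cl/OpenClTimer.hpp>
    #include <backends/Workload.hpp> // assumed location of armnn::IWorkload in this tree

    #include <iostream>

    void TimeClKernels(armnn::IWorkload& workload)
    {
        armnn::OpenClTimer timer;

        timer.Start();      // installs the clEnqueueNDRangeKernel interceptor shown above
        workload.Execute(); // each kernel enqueued here is recorded with its GWS/LWS
        timer.Stop();       // restores the original enqueue function pointer

        for (const armnn::Measurement& m : timer.GetMeasurements())
        {
            // e.g. "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]: 12.3 us";
            // m_Value is 0.0 when CL_QUEUE_PROFILING_ENABLE is not set on the queue.
            std::cout << m.m_Name << ": " << m.m_Value << " us" << std::endl;
        }
    }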
diff --git a/src/backends/cl/OpenClTimer.hpp b/src/backends/cl/OpenClTimer.hpp
new file mode 100644
index 0000000000..a7ae1387d9
--- /dev/null
+++ b/src/backends/cl/OpenClTimer.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Instrument.hpp"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include <arm_compute/core/CL/OpenCL.h>
+
+#include <vector>
+#include <list>
+
+namespace armnn
+{
+
+/// OpenClTimer instrument that times all OpenCl kernels executed between calls to Start() and Stop().
+class OpenClTimer : public Instrument
+{
+public:
+ OpenClTimer();
+ ~OpenClTimer() = default;
+
+ /// Start the OpenCl timer
+ void Start() override;
+
+ /// Stop the OpenCl timer
+ void Stop() override;
+
+ /// Get the name of the timer
+ /// \return Name of the timer
+ const char* GetName() const override { return "OpenClKernelTimer"; }
+
+ /// Get the recorded measurements. This will be a list of the execution durations for all the OpenCl kernels.
+ /// \return Recorded measurements
+ std::vector<Measurement> GetMeasurements() const override;
+
+private:
+ using CLScheduler = arm_compute::CLScheduler;
+ using CLSymbols = arm_compute::CLSymbols;
+ using ClEvent = cl::Event;
+ using ClEnqueueFunc = decltype(CLSymbols::clEnqueueNDRangeKernel_ptr);
+
+ /// Stores info about the OpenCl kernel
+ struct KernelInfo
+ {
+ KernelInfo(const std::string& name, cl_event& event) : m_Name(name), m_Event(event) {}
+
+ std::string m_Name;
+ ClEvent m_Event;
+ };
+
+ std::list<KernelInfo> m_Kernels; ///< List of all kernels executed
+ ClEnqueueFunc m_OriginalEnqueueFunction; ///< Keep track of original OpenCl function
+};
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 4375d9496c..205f7b5415 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -12,6 +12,7 @@ BACKEND_SOURCES := \
ClContextControl.cpp \
ClLayerSupport.cpp \
ClWorkloadFactory.cpp \
+ OpenClTimer.cpp \
workloads/ClActivationWorkload.cpp \
workloads/ClAdditionWorkload.cpp \
workloads/ClBatchNormalizationFloatWorkload.cpp \
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
index d365290a6c..4936a78645 100644
--- a/src/backends/cl/test/CMakeLists.txt
+++ b/src/backends/cl/test/CMakeLists.txt
@@ -8,6 +8,8 @@ list(APPEND armnnClBackendUnitTests_sources
ClCreateWorkloadTests.cpp
ClLayerSupportTests.cpp
ClLayerTests.cpp
+ ClMemCopyTests.cpp
+ OpenClTimerTest.cpp
)
add_library(armnnClBackendUnitTests OBJECT ${armnnClBackendUnitTests_sources})
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index 66c2c2aa40..526dc68fc5 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -6,13 +6,15 @@
#include "ClContextControlFixture.hpp"
#include <backends/MemCopyWorkload.hpp>
+
+#include <backends/aclCommon/test/CreateWorkloadClNeon.hpp>
+
#include <backends/cl/ClTensorHandle.hpp>
#include <backends/cl/ClWorkloadFactory.hpp>
#include <backends/cl/workloads/ClWorkloads.hpp>
#include <backends/cl/workloads/ClWorkloadUtils.hpp>
-#include <backends/reference/RefWorkloadFactory.hpp>
-#include <test/CreateWorkloadClNeon.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle* tensorHandle,
std::initializer_list<unsigned int> expectedDimensions)
diff --git a/src/backends/cl/test/ClMemCopyTests.cpp b/src/backends/cl/test/ClMemCopyTests.cpp
new file mode 100644
index 0000000000..af8a36d6c0
--- /dev/null
+++ b/src/backends/cl/test/ClMemCopyTests.cpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backends/cl/ClWorkloadFactory.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
+
+#include <backends/test/MemCopyTestImpl.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(ClMemCopy)
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/test/Fp16SupportTest.cpp b/src/backends/cl/test/Fp16SupportTest.cpp
new file mode 100644
index 0000000000..90bef3647b
--- /dev/null
+++ b/src/backends/cl/test/Fp16SupportTest.cpp
@@ -0,0 +1,112 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Descriptors.hpp>
+#include <armnn/IRuntime.hpp>
+#include <armnn/INetwork.hpp>
+#include <armnnUtils/Half.hpp>
+
+#include <Graph.hpp>
+#include <Optimizer.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/test/QuantizeHelper.hpp>
+
+#include <boost/core/ignore_unused.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <set>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Fp16Support)
+
+BOOST_AUTO_TEST_CASE(Fp16DataTypeSupport)
+{
+ Graph graph;
+
+ Layer* const inputLayer1 = graph.AddLayer<InputLayer>(1, "input1");
+ Layer* const inputLayer2 = graph.AddLayer<InputLayer>(2, "input2");
+
+ Layer* const additionLayer = graph.AddLayer<AdditionLayer>("addition");
+ Layer* const outputLayer = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+ TensorInfo fp16TensorInfo({1, 2, 3, 5}, armnn::DataType::Float16);
+ inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0));
+ inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1));
+ additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+
+ inputLayer1->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+ inputLayer2->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+ additionLayer->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+
+ BOOST_CHECK(inputLayer1->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+ BOOST_CHECK(inputLayer2->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+ BOOST_CHECK(additionLayer->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+}
+
+BOOST_AUTO_TEST_CASE(Fp16AdditionTest)
+{
+ using namespace half_float::literal;
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* inputLayer1 = net->AddInputLayer(0);
+ IConnectableLayer* inputLayer2 = net->AddInputLayer(1);
+ IConnectableLayer* additionLayer = net->AddAdditionLayer();
+ IConnectableLayer* outputLayer = net->AddOutputLayer(0);
+
+ inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0));
+ inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1));
+ additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+
+ //change to float16
+ TensorInfo fp16TensorInfo(TensorShape({4}), DataType::Float16);
+ inputLayer1->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+ inputLayer2->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+ additionLayer->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+
+ // optimize the network
+ std::vector<Compute> backends = {Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ std::vector<Half> input1Data
+ {
+ 1.0_h, 2.0_h, 3.0_h, 4.0_h
+ };
+
+ std::vector<Half> input2Data
+ {
+ 100.0_h, 200.0_h, 300.0_h, 400.0_h
+ };
+
+ InputTensors inputTensors
+ {
+ {0,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input1Data.data())},
+ {1,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input2Data.data())}
+ };
+
+ std::vector<Half> outputData(input1Data.size());
+ OutputTensors outputTensors
+ {
+ {0,Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
+ };
+
+ // Does the inference.
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Checks the results.
+ BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/test/OpenClTimerTest.cpp b/src/backends/cl/test/OpenClTimerTest.cpp
new file mode 100644
index 0000000000..70ceac2a3f
--- /dev/null
+++ b/src/backends/cl/test/OpenClTimerTest.cpp
@@ -0,0 +1,143 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#if (defined(__aarch64__)) || (defined(__x86_64__)) // disable test failing on FireFly/Armv7
+
+#include <armnn/test/TensorHelpers.hpp>
+
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/WorkloadFactory.hpp>
+
+#include <backends/cl/ClContextControl.hpp>
+#include <backends/cl/ClWorkloadFactory.hpp>
+#include <backends/cl/OpenClTimer.hpp>
+
+#include <backends/test/TensorCopyUtils.hpp>
+#include <backends/test/WorkloadTestUtils.hpp>
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <boost/format.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+
+using namespace armnn;
+
+struct OpenClFixture
+{
+ // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case.
+ // NOTE: Profiling needs to be enabled in ClContextControl to be able to obtain execution
+ // times from OpenClTimer.
+ OpenClFixture() : m_ClContextControl(nullptr, true) {}
+ ~OpenClFixture() {}
+
+ ClContextControl m_ClContextControl;
+};
+
+BOOST_FIXTURE_TEST_SUITE(OpenClTimerBatchNorm, OpenClFixture)
+using FactoryType = ClWorkloadFactory;
+
+BOOST_AUTO_TEST_CASE(OpenClTimerBatchNorm)
+{
+ ClWorkloadFactory workloadFactory;
+
+ const unsigned int width = 2;
+ const unsigned int height = 3;
+ const unsigned int channels = 2;
+ const unsigned int num = 1;
+ int32_t qOffset = 0;
+ float qScale = 0.f;
+
+ TensorInfo inputTensorInfo({num, channels, height, width}, GetDataType<float>());
+ TensorInfo outputTensorInfo({num, channels, height, width}, GetDataType<float>());
+ TensorInfo tensorInfo({channels}, GetDataType<float>());
+
+ // Set quantization parameters if the requested type is a quantized type.
+ if(IsQuantizedType<float>())
+ {
+ inputTensorInfo.SetQuantizationScale(qScale);
+ inputTensorInfo.SetQuantizationOffset(qOffset);
+ outputTensorInfo.SetQuantizationScale(qScale);
+ outputTensorInfo.SetQuantizationOffset(qOffset);
+ tensorInfo.SetQuantizationScale(qScale);
+ tensorInfo.SetQuantizationOffset(qOffset);
+ }
+
+ auto input = MakeTensor<float, 4>(inputTensorInfo,
+ QuantizedVector<float>(qScale, qOffset,
+ {
+ 1.f, 4.f,
+ 4.f, 2.f,
+ 1.f, 6.f,
+
+ 1.f, 1.f,
+ 4.f, 1.f,
+ -2.f, 4.f
+ }));
+ // these values are per-channel of the input
+ auto mean = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, -2}));
+ auto variance = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {4, 9}));
+ auto beta = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, 2}));
+ auto gamma = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {2, 1}));
+
+ std::unique_ptr<ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ BatchNormalizationQueueDescriptor data;
+ WorkloadInfo info;
+ ScopedCpuTensorHandle meanTensor(tensorInfo);
+ ScopedCpuTensorHandle varianceTensor(tensorInfo);
+ ScopedCpuTensorHandle betaTensor(tensorInfo);
+ ScopedCpuTensorHandle gammaTensor(tensorInfo);
+
+ AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]);
+ AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]);
+ AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]);
+ AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]);
+
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ data.m_Mean = &meanTensor;
+ data.m_Variance = &varianceTensor;
+ data.m_Beta = &betaTensor;
+ data.m_Gamma = &gammaTensor;
+ data.m_Parameters.m_Eps = 0.0f;
+
+ // for each channel:
+ // subtract mean, divide by standard deviation (with an epsilon to avoid div by 0)
+ // multiply by gamma and add beta
+ std::unique_ptr<IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info);
+
+ inputHandle->Allocate();
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+ OpenClTimer openClTimer;
+
+ BOOST_CHECK_EQUAL(openClTimer.GetName(), "OpenClKernelTimer");
+
+ //Start the timer
+ openClTimer.Start();
+
+ //Execute the workload
+ workload->Execute();
+
+ //Stop the timer
+ openClTimer.Stop();
+
+ BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().size(), 1);
+
+ BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().front().m_Name,
+ "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]");
+
+ BOOST_CHECK(openClTimer.GetMeasurements().front().m_Value > 0);
+
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+#endif //aarch64 or x86_64
\ No newline at end of file
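For reference, the per-channel computation the timed batch normalization workload performs, restating
the comment in the test above as a scalar formula; a sketch only, with eps corresponding to
data.m_Parameters.m_Eps (0.0f in this test) and the function name purely illustrative.

    #include <cmath>

    // out = gamma * (in - mean) / sqrt(variance + eps) + beta, applied element-wise per channel
    float BatchNormRef(float in, float mean, float variance, float beta, float gamma, float eps)
    {
        return gamma * (in - mean) / std::sqrt(variance + eps) + beta;
    }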
diff --git a/src/backends/cl/workloads/ClWorkloadUtils.hpp b/src/backends/cl/workloads/ClWorkloadUtils.hpp
index af4ccd0bb8..c765c63dce 100644
--- a/src/backends/cl/workloads/ClWorkloadUtils.hpp
+++ b/src/backends/cl/workloads/ClWorkloadUtils.hpp
@@ -5,11 +5,11 @@
#pragma once
#include <armnnUtils/Half.hpp>
+
#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/OpenClTimer.hpp>
#include <backends/CpuTensorHandle.hpp>
-#include "OpenClTimer.hpp"
-
#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \
ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \
name, \