aboutsummaryrefslogtreecommitdiff
path: root/src/backends
diff options
context:
space:
mode:
Diffstat (limited to 'src/backends')
-rw-r--r--src/backends/aclCommon/CMakeLists.txt2
-rw-r--r--src/backends/aclCommon/common.cmake1
-rw-r--r--src/backends/aclCommon/test/CMakeLists.txt14
-rw-r--r--src/backends/aclCommon/test/CreateWorkloadClNeon.hpp107
-rw-r--r--src/backends/aclCommon/test/MemCopyTests.cpp62
-rw-r--r--src/backends/cl/CMakeLists.txt5
-rw-r--r--src/backends/cl/OpenClTimer.cpp105
-rw-r--r--src/backends/cl/OpenClTimer.hpp59
-rw-r--r--src/backends/cl/backend.mk1
-rw-r--r--src/backends/cl/test/CMakeLists.txt2
-rw-r--r--src/backends/cl/test/ClCreateWorkloadTests.cpp6
-rw-r--r--src/backends/cl/test/ClMemCopyTests.cpp39
-rw-r--r--src/backends/cl/test/Fp16SupportTest.cpp112
-rw-r--r--src/backends/cl/test/OpenClTimerTest.cpp143
-rw-r--r--src/backends/cl/workloads/ClWorkloadUtils.hpp4
-rw-r--r--src/backends/neon/CMakeLists.txt4
-rw-r--r--src/backends/neon/NeonInterceptorScheduler.cpp47
-rw-r--r--src/backends/neon/NeonInterceptorScheduler.hpp38
-rw-r--r--src/backends/neon/NeonTimer.cpp63
-rw-r--r--src/backends/neon/NeonTimer.hpp43
-rw-r--r--src/backends/neon/backend.mk2
-rw-r--r--src/backends/neon/test/CMakeLists.txt2
-rw-r--r--src/backends/neon/test/NeonCreateWorkloadTests.cpp5
-rw-r--r--src/backends/neon/test/NeonMemCopyTests.cpp39
-rw-r--r--src/backends/neon/test/NeonTimerTest.cpp105
-rw-r--r--src/backends/neon/workloads/NeonWorkloadUtils.hpp4
-rwxr-xr-xsrc/backends/test/LayerTests.cpp5
-rw-r--r--src/backends/test/MemCopyTestImpl.hpp84
-rw-r--r--src/backends/test/MemCopyTests.cpp180
29 files changed, 1090 insertions, 193 deletions
diff --git a/src/backends/aclCommon/CMakeLists.txt b/src/backends/aclCommon/CMakeLists.txt
index 42f914263a..6f99f4f146 100644
--- a/src/backends/aclCommon/CMakeLists.txt
+++ b/src/backends/aclCommon/CMakeLists.txt
@@ -9,6 +9,8 @@ list(APPEND armnnAclCommon_sources
ArmComputeUtils.hpp
)
+add_subdirectory(test)
+
add_library(armnnAclCommon STATIC ${armnnAclCommon_sources})
target_include_directories(armnnAclCommon PRIVATE ${PROJECT_SOURCE_DIR}/src)
target_include_directories(armnnAclCommon PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
diff --git a/src/backends/aclCommon/common.cmake b/src/backends/aclCommon/common.cmake
index d9d035f307..89be236a7f 100644
--- a/src/backends/aclCommon/common.cmake
+++ b/src/backends/aclCommon/common.cmake
@@ -6,4 +6,5 @@
if(ARMCOMPUTENEON OR ARMCOMPUTECL)
add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/aclCommon)
list(APPEND armnnLibraries armnnAclCommon)
+ list(APPEND armnnUnitTestLibraries armnnAclCommonUnitTests)
endif()
diff --git a/src/backends/aclCommon/test/CMakeLists.txt b/src/backends/aclCommon/test/CMakeLists.txt
new file mode 100644
index 0000000000..98008edeb5
--- /dev/null
+++ b/src/backends/aclCommon/test/CMakeLists.txt
@@ -0,0 +1,14 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnAclCommonUnitTests_sources
+ CreateWorkloadClNeon.hpp
+ MemCopyTests.cpp
+)
+
+add_library(armnnAclCommonUnitTests OBJECT ${armnnAclCommonUnitTests_sources})
+target_include_directories(armnnAclCommonUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(armnnAclCommonUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnAclCommonUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils) \ No newline at end of file
diff --git a/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp b/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp
new file mode 100644
index 0000000000..a79cfe6b0d
--- /dev/null
+++ b/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp
@@ -0,0 +1,107 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/test/CreateWorkload.hpp>
+
+#include <backends/MemCopyWorkload.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
+
+#if ARMCOMPUTECL_ENABLED
+#include <backends/cl/ClTensorHandle.hpp>
+#endif
+
+#if ARMCOMPUTENEON_ENABLED
+#include <backends/neon/NeonTensorHandle.hpp>
+#endif
+
+using namespace armnn;
+
+namespace
+{
+
+using namespace std;
+
+template<typename IComputeTensorHandle>
+boost::test_tools::predicate_result CompareTensorHandleShape(IComputeTensorHandle* tensorHandle,
+ std::initializer_list<unsigned int> expectedDimensions)
+{
+ arm_compute::ITensorInfo* info = tensorHandle->GetTensor().info();
+
+ auto infoNumDims = info->num_dimensions();
+ auto numExpectedDims = expectedDimensions.size();
+ if (infoNumDims != numExpectedDims)
+ {
+ boost::test_tools::predicate_result res(false);
+ res.message() << "Different number of dimensions [" << info->num_dimensions()
+ << "!=" << expectedDimensions.size() << "]";
+ return res;
+ }
+
+ size_t i = info->num_dimensions() - 1;
+
+ for (unsigned int expectedDimension : expectedDimensions)
+ {
+ if (info->dimension(i) != expectedDimension)
+ {
+ boost::test_tools::predicate_result res(false);
+ res.message() << "Different dimension [" << info->dimension(i) << "!=" << expectedDimension << "]";
+ return res;
+ }
+
+ i--;
+ }
+
+ return true;
+}
+
+template<typename IComputeTensorHandle>
+void CreateMemCopyWorkloads(IWorkloadFactory& factory)
+{
+ Graph graph;
+ RefWorkloadFactory refFactory;
+
+ // Creates the layers we're testing.
+ Layer* const layer1 = graph.AddLayer<MemCopyLayer>("layer1");
+ Layer* const layer2 = graph.AddLayer<MemCopyLayer>("layer2");
+
+ // Creates extra layers.
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input");
+ Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
+
+ // Connects up.
+ TensorInfo tensorInfo({2, 3}, DataType::Float32);
+ Connect(input, layer1, tensorInfo);
+ Connect(layer1, layer2, tensorInfo);
+ Connect(layer2, output, tensorInfo);
+
+ input->CreateTensorHandles(graph, refFactory);
+ layer1->CreateTensorHandles(graph, factory);
+ layer2->CreateTensorHandles(graph, refFactory);
+ output->CreateTensorHandles(graph, refFactory);
+
+ // make the workloads and check them
+ auto workload1 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer1, graph, factory);
+ auto workload2 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer2, graph, refFactory);
+
+ MemCopyQueueDescriptor queueDescriptor1 = workload1->GetData();
+ BOOST_TEST(queueDescriptor1.m_Inputs.size() == 1);
+ BOOST_TEST(queueDescriptor1.m_Outputs.size() == 1);
+ auto inputHandle1 = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor1.m_Inputs[0]);
+ auto outputHandle1 = boost::polymorphic_downcast<IComputeTensorHandle*>(queueDescriptor1.m_Outputs[0]);
+ BOOST_TEST((inputHandle1->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32)));
+ BOOST_TEST(CompareTensorHandleShape<IComputeTensorHandle>(outputHandle1, {2, 3}));
+
+
+ MemCopyQueueDescriptor queueDescriptor2 = workload2->GetData();
+ BOOST_TEST(queueDescriptor2.m_Inputs.size() == 1);
+ BOOST_TEST(queueDescriptor2.m_Outputs.size() == 1);
+ auto inputHandle2 = boost::polymorphic_downcast<IComputeTensorHandle*>(queueDescriptor2.m_Inputs[0]);
+ auto outputHandle2 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor2.m_Outputs[0]);
+ BOOST_TEST(CompareTensorHandleShape<IComputeTensorHandle>(inputHandle2, {2, 3}));
+ BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32)));
+}
+
+} //namespace \ No newline at end of file
diff --git a/src/backends/aclCommon/test/MemCopyTests.cpp b/src/backends/aclCommon/test/MemCopyTests.cpp
new file mode 100644
index 0000000000..8ecdb1014d
--- /dev/null
+++ b/src/backends/aclCommon/test/MemCopyTests.cpp
@@ -0,0 +1,62 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClWorkloadFactory.hpp>
+#include <backends/neon/NeonWorkloadFactory.hpp>
+#include <backends/test/MemCopyTestImpl.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(MemCopyTestSuite)
+
+BOOST_AUTO_TEST_CASE(AclTypeConversions)
+{
+ arm_compute::Strides strides(1, 2, 3, 4);
+ armnn::TensorShape convertedStrides = armnn::armcomputetensorutils::GetStrides(strides);
+
+ BOOST_TEST(convertedStrides[0] == 4);
+ BOOST_TEST(convertedStrides[1] == 3);
+ BOOST_TEST(convertedStrides[2] == 2);
+ BOOST_TEST(convertedStrides[3] == 1);
+
+ arm_compute::TensorShape shape(5, 6, 7, 8);
+ armnn::TensorShape convertedshape = armnn::armcomputetensorutils::GetShape(shape);
+
+ BOOST_TEST(convertedshape[0] == 8);
+ BOOST_TEST(convertedshape[1] == 7);
+ BOOST_TEST(convertedshape[2] == 6);
+ BOOST_TEST(convertedshape[3] == 5);
+}
+
+#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED
+
+BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndGpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::NeonWorkloadFactory, armnn::ClWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndNeon)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::NeonWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndGpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::NeonWorkloadFactory, armnn::ClWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndNeonWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::NeonWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+#endif
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt
index 04da6ddcff..2f32081dfe 100644
--- a/src/backends/cl/CMakeLists.txt
+++ b/src/backends/cl/CMakeLists.txt
@@ -15,6 +15,11 @@ list(APPEND armnnClBackend_sources
)
if(ARMCOMPUTECL)
+ list(APPEND armnnClBackend_sources
+ OpenClTimer.cpp
+ OpenClTimer.hpp
+ )
+
add_subdirectory(workloads)
add_subdirectory(test)
endif()
diff --git a/src/backends/cl/OpenClTimer.cpp b/src/backends/cl/OpenClTimer.cpp
new file mode 100644
index 0000000000..57552d7bd9
--- /dev/null
+++ b/src/backends/cl/OpenClTimer.cpp
@@ -0,0 +1,105 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "OpenClTimer.hpp"
+
+#include <string>
+#include <sstream>
+
+namespace armnn
+{
+
+OpenClTimer::OpenClTimer()
+{
+}
+
+void OpenClTimer::Start()
+{
+ m_Kernels.clear();
+
+ auto interceptor = [this]( cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *gwo,
+ const size_t *gws,
+ const size_t *lws,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+ {
+ cl_int retVal = 0;
+
+ // Get the name of the kernel
+ cl::Kernel retainedKernel(kernel, true);
+ std::stringstream ss;
+ ss << retainedKernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
+
+ // Embed workgroup sizes into the name
+ if(gws != nullptr)
+ {
+ ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
+ }
+ if(lws != nullptr)
+ {
+ ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
+ }
+
+ cl_event customEvent;
+
+ // Forward to original OpenCl function
+ retVal = m_OriginalEnqueueFunction( command_queue,
+ kernel,
+ work_dim,
+ gwo,
+ gws,
+ lws,
+ num_events_in_wait_list,
+ event_wait_list,
+ &customEvent);
+
+ // Store the Kernel info for later GetMeasurements() call
+ m_Kernels.emplace_back(ss.str(), customEvent);
+
+ return retVal;
+ };
+
+ m_OriginalEnqueueFunction = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
+}
+
+void OpenClTimer::Stop()
+{
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = m_OriginalEnqueueFunction;
+}
+
+std::vector<Measurement> OpenClTimer::GetMeasurements() const
+{
+ std::vector<Measurement> measurements;
+
+ cl_command_queue_properties clQueueProperties = CLScheduler::get().queue().getInfo<CL_QUEUE_PROPERTIES>();
+
+ int idx = 0;
+ for (auto& kernel : m_Kernels)
+ {
+ std::string name = std::string(this->GetName()) + "/" + std::to_string(idx++) + ": " + kernel.m_Name;
+
+ double timeUs = 0.0;
+ if((clQueueProperties & CL_QUEUE_PROFILING_ENABLE) != 0)
+ {
+ // Wait for the event to finish before accessing profile results.
+ kernel.m_Event.wait();
+
+ cl_ulong start = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ cl_ulong end = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ timeUs = static_cast<double>(end - start) / 1000.0;
+ }
+
+ measurements.emplace_back(name, timeUs, Measurement::Unit::TIME_US);
+ }
+
+ return measurements;
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/OpenClTimer.hpp b/src/backends/cl/OpenClTimer.hpp
new file mode 100644
index 0000000000..a7ae1387d9
--- /dev/null
+++ b/src/backends/cl/OpenClTimer.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Instrument.hpp"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include <arm_compute/core/CL/OpenCL.h>
+
+#include <vector>
+#include <list>
+
+namespace armnn
+{
+
+/// OpenClTimer instrument that times all OpenCl kernels executed between calls to Start() and Stop().
+class OpenClTimer : public Instrument
+{
+public:
+ OpenClTimer();
+ ~OpenClTimer() = default;
+
+ /// Start the OpenCl timer
+ void Start() override;
+
+ /// Stop the OpenCl timer
+ void Stop() override;
+
+ /// Get the name of the timer
+ /// \return Name of the timer
+ const char* GetName() const override { return "OpenClKernelTimer"; }
+
+ /// Get the recorded measurements. This will be a list of the execution durations for all the OpenCl kernels.
+ /// \return Recorded measurements
+ std::vector<Measurement> GetMeasurements() const override;
+
+private:
+ using CLScheduler = arm_compute::CLScheduler;
+ using CLSymbols = arm_compute::CLSymbols;
+ using ClEvent = cl::Event;
+ using ClEnqueueFunc = decltype(CLSymbols::clEnqueueNDRangeKernel_ptr);
+
+ /// Stores info about the OpenCl kernel
+ struct KernelInfo
+ {
+ KernelInfo(const std::string& name, cl_event& event) : m_Name(name), m_Event(event) {}
+
+ std::string m_Name;
+ ClEvent m_Event;
+ };
+
+ std::list<KernelInfo> m_Kernels; ///< List of all kernels executed
+ ClEnqueueFunc m_OriginalEnqueueFunction; ///< Keep track of original OpenCl function
+};
+
+} //namespace armnn \ No newline at end of file
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 4375d9496c..205f7b5415 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -12,6 +12,7 @@ BACKEND_SOURCES := \
ClContextControl.cpp \
ClLayerSupport.cpp \
ClWorkloadFactory.cpp \
+ OpenClTimer.cpp \
workloads/ClActivationWorkload.cpp \
workloads/ClAdditionWorkload.cpp \
workloads/ClBatchNormalizationFloatWorkload.cpp \
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
index d365290a6c..4936a78645 100644
--- a/src/backends/cl/test/CMakeLists.txt
+++ b/src/backends/cl/test/CMakeLists.txt
@@ -8,6 +8,8 @@ list(APPEND armnnClBackendUnitTests_sources
ClCreateWorkloadTests.cpp
ClLayerSupportTests.cpp
ClLayerTests.cpp
+ ClMemCopyTests.cpp
+ OpenClTimerTest.cpp
)
add_library(armnnClBackendUnitTests OBJECT ${armnnClBackendUnitTests_sources})
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index 66c2c2aa40..526dc68fc5 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -6,13 +6,15 @@
#include "ClContextControlFixture.hpp"
#include <backends/MemCopyWorkload.hpp>
+
+#include <backends/aclCommon/test/CreateWorkloadClNeon.hpp>
+
#include <backends/cl/ClTensorHandle.hpp>
#include <backends/cl/ClWorkloadFactory.hpp>
#include <backends/cl/workloads/ClWorkloads.hpp>
#include <backends/cl/workloads/ClWorkloadUtils.hpp>
-#include <backends/reference/RefWorkloadFactory.hpp>
-#include <test/CreateWorkloadClNeon.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle* tensorHandle,
std::initializer_list<unsigned int> expectedDimensions)
diff --git a/src/backends/cl/test/ClMemCopyTests.cpp b/src/backends/cl/test/ClMemCopyTests.cpp
new file mode 100644
index 0000000000..af8a36d6c0
--- /dev/null
+++ b/src/backends/cl/test/ClMemCopyTests.cpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backends/cl/ClWorkloadFactory.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
+
+#include <backends/test/MemCopyTestImpl.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(ClMemCopy)
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/test/Fp16SupportTest.cpp b/src/backends/cl/test/Fp16SupportTest.cpp
new file mode 100644
index 0000000000..90bef3647b
--- /dev/null
+++ b/src/backends/cl/test/Fp16SupportTest.cpp
@@ -0,0 +1,112 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Descriptors.hpp>
+#include <armnn/IRuntime.hpp>
+#include <armnn/INetwork.hpp>
+#include <armnnUtils/Half.hpp>
+
+#include <Graph.hpp>
+#include <Optimizer.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/test/QuantizeHelper.hpp>
+
+#include <boost/core/ignore_unused.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <set>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Fp16Support)
+
+BOOST_AUTO_TEST_CASE(Fp16DataTypeSupport)
+{
+ Graph graph;
+
+ Layer* const inputLayer1 = graph.AddLayer<InputLayer>(1, "input1");
+ Layer* const inputLayer2 = graph.AddLayer<InputLayer>(2, "input2");
+
+ Layer* const additionLayer = graph.AddLayer<AdditionLayer>("addition");
+ Layer* const outputLayer = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+ TensorInfo fp16TensorInfo({1, 2, 3, 5}, armnn::DataType::Float16);
+ inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0));
+ inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1));
+ additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+
+ inputLayer1->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+ inputLayer2->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+ additionLayer->GetOutputSlot().SetTensorInfo(fp16TensorInfo);
+
+ BOOST_CHECK(inputLayer1->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+ BOOST_CHECK(inputLayer2->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+ BOOST_CHECK(additionLayer->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16);
+}
+
+BOOST_AUTO_TEST_CASE(Fp16AdditionTest)
+{
+ using namespace half_float::literal;
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* inputLayer1 = net->AddInputLayer(0);
+ IConnectableLayer* inputLayer2 = net->AddInputLayer(1);
+ IConnectableLayer* additionLayer = net->AddAdditionLayer();
+ IConnectableLayer* outputLayer = net->AddOutputLayer(0);
+
+ inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0));
+ inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1));
+ additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+
+ //change to float16
+ TensorInfo fp16TensorInfo(TensorShape({4}), DataType::Float16);
+ inputLayer1->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+ inputLayer2->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+ additionLayer->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo);
+
+ // optimize the network
+ std::vector<Compute> backends = {Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ std::vector<Half> input1Data
+ {
+ 1.0_h, 2.0_h, 3.0_h, 4.0_h
+ };
+
+ std::vector<Half> input2Data
+ {
+ 100.0_h, 200.0_h, 300.0_h, 400.0_h
+ };
+
+ InputTensors inputTensors
+ {
+ {0,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input1Data.data())},
+ {1,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input2Data.data())}
+ };
+
+ std::vector<Half> outputData(input1Data.size());
+ OutputTensors outputTensors
+ {
+ {0,Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
+ };
+
+ // Does the inference.
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Checks the results.
+ BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/test/OpenClTimerTest.cpp b/src/backends/cl/test/OpenClTimerTest.cpp
new file mode 100644
index 0000000000..70ceac2a3f
--- /dev/null
+++ b/src/backends/cl/test/OpenClTimerTest.cpp
@@ -0,0 +1,143 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#if (defined(__aarch64__)) || (defined(__x86_64__)) // disable test failing on FireFly/Armv7
+
+#include <armnn/test/TensorHelpers.hpp>
+
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/WorkloadFactory.hpp>
+
+#include <backends/cl/ClContextControl.hpp>
+#include <backends/cl/ClWorkloadFactory.hpp>
+#include <backends/cl/OpenClTimer.hpp>
+
+#include <backends/test/TensorCopyUtils.hpp>
+#include <backends/test/WorkloadTestUtils.hpp>
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <boost/format.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <iostream>
+
+using namespace armnn;
+
+struct OpenClFixture
+{
+ // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case.
+ // NOTE: Profiling needs to be enabled in ClContextControl to be able to obtain execution
+ // times from OpenClTimer.
+ OpenClFixture() : m_ClContextControl(nullptr, true) {}
+ ~OpenClFixture() {}
+
+ ClContextControl m_ClContextControl;
+};
+
+BOOST_FIXTURE_TEST_SUITE(OpenClTimerBatchNorm, OpenClFixture)
+using FactoryType = ClWorkloadFactory;
+
+BOOST_AUTO_TEST_CASE(OpenClTimerBatchNorm)
+{
+ ClWorkloadFactory workloadFactory;
+
+ const unsigned int width = 2;
+ const unsigned int height = 3;
+ const unsigned int channels = 2;
+ const unsigned int num = 1;
+ int32_t qOffset = 0;
+ float qScale = 0.f;
+
+ TensorInfo inputTensorInfo({num, channels, height, width}, GetDataType<float>());
+ TensorInfo outputTensorInfo({num, channels, height, width}, GetDataType<float>());
+ TensorInfo tensorInfo({channels}, GetDataType<float>());
+
+ // Set quantization parameters if the requested type is a quantized type.
+ if(IsQuantizedType<float>())
+ {
+ inputTensorInfo.SetQuantizationScale(qScale);
+ inputTensorInfo.SetQuantizationOffset(qOffset);
+ outputTensorInfo.SetQuantizationScale(qScale);
+ outputTensorInfo.SetQuantizationOffset(qOffset);
+ tensorInfo.SetQuantizationScale(qScale);
+ tensorInfo.SetQuantizationOffset(qOffset);
+ }
+
+ auto input = MakeTensor<float, 4>(inputTensorInfo,
+ QuantizedVector<float>(qScale, qOffset,
+ {
+ 1.f, 4.f,
+ 4.f, 2.f,
+ 1.f, 6.f,
+
+ 1.f, 1.f,
+ 4.f, 1.f,
+ -2.f, 4.f
+ }));
+ // these values are per-channel of the input
+ auto mean = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, -2}));
+ auto variance = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {4, 9}));
+ auto beta = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, 2}));
+ auto gamma = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {2, 1}));
+
+ std::unique_ptr<ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ BatchNormalizationQueueDescriptor data;
+ WorkloadInfo info;
+ ScopedCpuTensorHandle meanTensor(tensorInfo);
+ ScopedCpuTensorHandle varianceTensor(tensorInfo);
+ ScopedCpuTensorHandle betaTensor(tensorInfo);
+ ScopedCpuTensorHandle gammaTensor(tensorInfo);
+
+ AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]);
+ AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]);
+ AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]);
+ AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]);
+
+ AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+ AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+ data.m_Mean = &meanTensor;
+ data.m_Variance = &varianceTensor;
+ data.m_Beta = &betaTensor;
+ data.m_Gamma = &gammaTensor;
+ data.m_Parameters.m_Eps = 0.0f;
+
+ // for each channel:
+ // substract mean, divide by standard deviation (with an epsilon to avoid div by 0)
+ // multiply by gamma and add beta
+ std::unique_ptr<IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info);
+
+ inputHandle->Allocate();
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+ OpenClTimer openClTimer;
+
+ BOOST_CHECK_EQUAL(openClTimer.GetName(), "OpenClKernelTimer");
+
+ //Start the timer
+ openClTimer.Start();
+
+ //Execute the workload
+ workload->Execute();
+
+ //Stop the timer
+ openClTimer.Stop();
+
+ BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().size(), 1);
+
+ BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().front().m_Name,
+ "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]");
+
+ BOOST_CHECK(openClTimer.GetMeasurements().front().m_Value > 0);
+
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+#endif //aarch64 or x86_64 \ No newline at end of file
diff --git a/src/backends/cl/workloads/ClWorkloadUtils.hpp b/src/backends/cl/workloads/ClWorkloadUtils.hpp
index af4ccd0bb8..c765c63dce 100644
--- a/src/backends/cl/workloads/ClWorkloadUtils.hpp
+++ b/src/backends/cl/workloads/ClWorkloadUtils.hpp
@@ -5,11 +5,11 @@
#pragma once
#include <armnnUtils/Half.hpp>
+
#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/OpenClTimer.hpp>
#include <backends/CpuTensorHandle.hpp>
-#include "OpenClTimer.hpp"
-
#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \
ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \
name, \
diff --git a/src/backends/neon/CMakeLists.txt b/src/backends/neon/CMakeLists.txt
index 93c7955a5f..152955aa06 100644
--- a/src/backends/neon/CMakeLists.txt
+++ b/src/backends/neon/CMakeLists.txt
@@ -7,11 +7,15 @@ if(ARMCOMPUTENEON)
list(APPEND armnnNeonBackend_sources
NeonBackend.cpp
NeonBackend.hpp
+ NeonInterceptorScheduler.hpp
+ NeonInterceptorScheduler.cpp
NeonLayerSupport.cpp
NeonLayerSupport.hpp
NeonWorkloadFactory.cpp
NeonWorkloadFactory.hpp
NeonTensorHandle.hpp
+ NeonTimer.hpp
+ NeonTimer.cpp
)
add_subdirectory(workloads)
diff --git a/src/backends/neon/NeonInterceptorScheduler.cpp b/src/backends/neon/NeonInterceptorScheduler.cpp
new file mode 100644
index 0000000000..03b4670296
--- /dev/null
+++ b/src/backends/neon/NeonInterceptorScheduler.cpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonInterceptorScheduler.hpp"
+
+#include <boost/assert.hpp>
+
+namespace armnn{
+
+NeonInterceptorScheduler::NeonInterceptorScheduler(arm_compute::IScheduler &realScheduler)
+ : m_Kernels(nullptr), m_RealScheduler(realScheduler)
+{
+}
+
+void NeonInterceptorScheduler::set_num_threads(unsigned int numThreads)
+{
+ m_RealScheduler.set_num_threads(numThreads);
+}
+
+unsigned int NeonInterceptorScheduler::num_threads() const
+{
+ return m_RealScheduler.num_threads();
+}
+
+void NeonInterceptorScheduler::schedule(arm_compute::ICPPKernel* kernel, const Hints& hints)
+{
+ WallClockTimer::clock::time_point startTime = WallClockTimer::clock::now();
+ m_RealScheduler.schedule(kernel, hints.split_dimension());
+ WallClockTimer::clock::time_point stopTime = WallClockTimer::clock::now();
+
+ const auto delta = std::chrono::duration<double, std::micro>(stopTime - startTime);
+ m_Kernels->emplace_back(kernel->name(), delta.count(), Measurement::Unit::TIME_US);
+}
+
+void NeonInterceptorScheduler::run_workloads(std::vector <Workload>& workloads)
+{
+ WallClockTimer::clock::time_point startTime = WallClockTimer::clock::now();
+ m_RealScheduler.run_tagged_workloads(workloads, nullptr);
+ WallClockTimer::clock::time_point stopTime = WallClockTimer::clock::now();
+
+ const auto delta = std::chrono::duration<double, std::micro>(stopTime - startTime);
+ m_Kernels->emplace_back(std::string("Workload"), delta.count(), Measurement::Unit::TIME_US);
+}
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/neon/NeonInterceptorScheduler.hpp b/src/backends/neon/NeonInterceptorScheduler.hpp
new file mode 100644
index 0000000000..f33b79a2da
--- /dev/null
+++ b/src/backends/neon/NeonInterceptorScheduler.hpp
@@ -0,0 +1,38 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "NeonTimer.hpp"
+#include "WallClockTimer.hpp"
+
+#include <arm_compute/runtime/IScheduler.h>
+#include <arm_compute/runtime/Scheduler.h>
+#include <arm_compute/core/CPP/ICPPKernel.h>
+
+namespace armnn
+{
+
+class NeonInterceptorScheduler : public arm_compute::IScheduler
+{
+public:
+ NeonInterceptorScheduler(arm_compute::IScheduler &realScheduler);
+ ~NeonInterceptorScheduler() = default;
+
+ void set_num_threads(unsigned int numThreads) override;
+
+ unsigned int num_threads() const override;
+
+ void schedule(arm_compute::ICPPKernel *kernel, const Hints &hints) override;
+
+ void run_workloads(std::vector<Workload> &workloads) override;
+
+ void SetKernels(NeonTimer::KernelMeasurements* kernels) { m_Kernels = kernels; }
+ NeonTimer::KernelMeasurements* GetKernels() { return m_Kernels; }
+private:
+ NeonTimer::KernelMeasurements* m_Kernels;
+ arm_compute::IScheduler& m_RealScheduler;
+};
+
+} // namespace armnn
diff --git a/src/backends/neon/NeonTimer.cpp b/src/backends/neon/NeonTimer.cpp
new file mode 100644
index 0000000000..219edc9680
--- /dev/null
+++ b/src/backends/neon/NeonTimer.cpp
@@ -0,0 +1,63 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonTimer.hpp"
+#include "NeonInterceptorScheduler.hpp"
+
+#include <memory>
+
+#include <boost/assert.hpp>
+#include <boost/format.hpp>
+
+namespace armnn
+{
+namespace
+{
+static thread_local auto g_Interceptor = std::make_shared<NeonInterceptorScheduler>(arm_compute::Scheduler::get());
+}
+
+void NeonTimer::Start()
+{
+ m_Kernels.clear();
+ BOOST_ASSERT(g_Interceptor->GetKernels() == nullptr);
+ g_Interceptor->SetKernels(&m_Kernels);
+
+ m_RealSchedulerType = arm_compute::Scheduler::get_type();
+ //Note: We can't currently replace a custom scheduler
+ if(m_RealSchedulerType != arm_compute::Scheduler::Type::CUSTOM)
+ {
+        // Keep the real scheduler and add NeonInterceptorScheduler as an interceptor
+ m_RealScheduler = &arm_compute::Scheduler::get();
+ arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(g_Interceptor));
+ }
+}
+
+void NeonTimer::Stop()
+{
+ // Restore real scheduler
+ g_Interceptor->SetKernels(nullptr);
+ arm_compute::Scheduler::set(m_RealSchedulerType);
+ m_RealScheduler = nullptr;
+}
+
+std::vector<Measurement> NeonTimer::GetMeasurements() const
+{
+ std::vector<Measurement> measurements = m_Kernels;
+ unsigned int kernel_number = 0;
+ for (auto & kernel : measurements)
+ {
+ std::string kernelName = std::string(this->GetName()) + "/" + std::to_string(kernel_number++) + ": " + kernel
+ .m_Name;
+ kernel.m_Name = kernelName;
+ }
+ return measurements;
+}
+
+const char* NeonTimer::GetName() const
+{
+ return "NeonKernelTimer";
+}
+
+}
diff --git a/src/backends/neon/NeonTimer.hpp b/src/backends/neon/NeonTimer.hpp
new file mode 100644
index 0000000000..31d3e85a7c
--- /dev/null
+++ b/src/backends/neon/NeonTimer.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Instrument.hpp"
+
+#include <arm_compute/runtime/IScheduler.h>
+#include <arm_compute/runtime/Scheduler.h>
+#include <arm_compute/core/CPP/ICPPKernel.h>
+
+#include <chrono>
+#include <map>
+#include <list>
+
+namespace armnn
+{
+
+class NeonTimer : public Instrument
+{
+public:
+ using KernelMeasurements = std::vector<Measurement>;
+
+ NeonTimer() = default;
+ ~NeonTimer() = default;
+
+ void Start() override;
+
+ void Stop() override;
+
+ std::vector<Measurement> GetMeasurements() const override;
+
+ const char* GetName() const override;
+
+private:
+ KernelMeasurements m_Kernels;
+ arm_compute::IScheduler* m_RealScheduler;
+ arm_compute::Scheduler::Type m_RealSchedulerType;
+};
+
+}
\ No newline at end of file
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 132328ba72..af83fb1321 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -9,7 +9,9 @@
BACKEND_SOURCES := \
NeonBackend.cpp \
+ NeonInterceptorScheduler.cpp \
NeonLayerSupport.cpp \
+ NeonTimer.cpp \
NeonWorkloadFactory.cpp \
workloads/NeonActivationWorkload.cpp \
workloads/NeonAdditionFloatWorkload.cpp \
diff --git a/src/backends/neon/test/CMakeLists.txt b/src/backends/neon/test/CMakeLists.txt
index 87da01e9e2..4a3380c3f9 100644
--- a/src/backends/neon/test/CMakeLists.txt
+++ b/src/backends/neon/test/CMakeLists.txt
@@ -7,6 +7,8 @@ list(APPEND armnnNeonBackendUnitTests_sources
NeonCreateWorkloadTests.cpp
NeonLayerSupportTests.cpp
NeonLayerTests.cpp
+ NeonMemCopyTests.cpp
+ NeonTimerTest.cpp
)
add_library(armnnNeonBackendUnitTests OBJECT ${armnnNeonBackendUnitTests_sources})
diff --git a/src/backends/neon/test/NeonCreateWorkloadTests.cpp b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
index 2c4d0ae0f9..ec8fe803a1 100644
--- a/src/backends/neon/test/NeonCreateWorkloadTests.cpp
+++ b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
@@ -3,9 +3,10 @@
// SPDX-License-Identifier: MIT
//
-#include <armnn/test/CreateWorkloadClNeon.hpp>
-
#include <backends/MemCopyWorkload.hpp>
+
+#include <backends/aclCommon/test/CreateWorkloadClNeon.hpp>
+
#include <backends/neon/NeonWorkloadFactory.hpp>
#include <backends/neon/NeonTensorHandle.hpp>
#include <backends/neon/workloads/NeonWorkloadUtils.hpp>
diff --git a/src/backends/neon/test/NeonMemCopyTests.cpp b/src/backends/neon/test/NeonMemCopyTests.cpp
new file mode 100644
index 0000000000..ddb47343a1
--- /dev/null
+++ b/src/backends/neon/test/NeonMemCopyTests.cpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backends/neon/NeonWorkloadFactory.hpp>
+#include <backends/reference/RefWorkloadFactory.hpp>
+
+#include <backends/test/MemCopyTestImpl.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(NeonMemCopy)
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndNeon)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::NeonWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndCpu)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::NeonWorkloadFactory, armnn::RefWorkloadFactory>(false);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndNeonWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::RefWorkloadFactory, armnn::NeonWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndCpuWithSubtensors)
+{
+ LayerTestResult<float, 4> result = MemCopyTest<armnn::NeonWorkloadFactory, armnn::RefWorkloadFactory>(true);
+ BOOST_TEST(CompareTensors(result.output, result.outputExpected));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/test/NeonTimerTest.cpp b/src/backends/neon/test/NeonTimerTest.cpp
new file mode 100644
index 0000000000..06f19c6ec3
--- /dev/null
+++ b/src/backends/neon/test/NeonTimerTest.cpp
@@ -0,0 +1,105 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/ArmNN.hpp>
+
+#include <armnn/test/TensorHelpers.hpp>
+
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/WorkloadFactory.hpp>
+
+#include <backends/neon/NeonTimer.hpp>
+#include <backends/neon/NeonWorkloadFactory.hpp>
+
+#include <backends/test/LayerTests.hpp>
+#include <backends/test/TensorCopyUtils.hpp>
+#include <backends/test/WorkloadTestUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+#include <cstdlib>
+#include <algorithm>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(NeonTimerInstrument)
+
+
+BOOST_AUTO_TEST_CASE(NeonTimerGetName)
+{
+ NeonTimer neonTimer;
+ BOOST_CHECK_EQUAL(neonTimer.GetName(), "NeonKernelTimer");
+}
+
+BOOST_AUTO_TEST_CASE(NeonTimerMeasure)
+{
+ NeonWorkloadFactory workloadFactory;
+
+ unsigned int inputWidth = 4000u;
+ unsigned int inputHeight = 5000u;
+ unsigned int inputChannels = 1u;
+ unsigned int inputBatchSize = 1u;
+
+ float upperBound = 1.0f;
+ float lowerBound = -1.0f;
+
+ size_t inputSize = inputWidth * inputHeight * inputChannels * inputBatchSize;
+ std::vector<float> inputData(inputSize, 0.f);
+ std::generate(inputData.begin(), inputData.end(), [](){
+ return (static_cast<float>(rand()) / static_cast<float>(RAND_MAX / 3)) + 1.f; });
+
+ unsigned int outputWidth = inputWidth;
+ unsigned int outputHeight = inputHeight;
+ unsigned int outputChannels = inputChannels;
+ unsigned int outputBatchSize = inputBatchSize;
+
+ armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth },
+ armnn::GetDataType<float>());
+
+ armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth },
+ armnn::GetDataType<float>());
+
+ LayerTestResult<float, 4> result(inputTensorInfo);
+
+ auto input = MakeTensor<float, 4>(inputTensorInfo, inputData);
+
+ std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+ std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+ // Setup bounded ReLu
+ armnn::ActivationQueueDescriptor descriptor;
+ armnn::WorkloadInfo workloadInfo;
+ AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+ AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+ descriptor.m_Parameters.m_Function = armnn::ActivationFunction::BoundedReLu;
+ descriptor.m_Parameters.m_A = upperBound;
+ descriptor.m_Parameters.m_B = lowerBound;
+
+ std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo);
+
+ inputHandle->Allocate();
+ outputHandle->Allocate();
+
+ CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+ NeonTimer neonTimer;
+ // Start the timer.
+ neonTimer.Start();
+ // Execute the workload.
+ workload->Execute();
+ // Stop the timer.
+ neonTimer.Stop();
+
+ std::vector<Measurement> measurements = neonTimer.GetMeasurements();
+
+ BOOST_CHECK_EQUAL(measurements.size(), 2);
+ BOOST_CHECK_EQUAL(measurements[0].m_Name, "NeonKernelTimer/0: NEFillBorderKernel");
+ BOOST_CHECK(measurements[0].m_Value > 0.0);
+ BOOST_CHECK_EQUAL(measurements[1].m_Name, "NeonKernelTimer/1: NEActivationLayerKernel");
+ BOOST_CHECK(measurements[1].m_Value > 0.0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/workloads/NeonWorkloadUtils.hpp b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
index 22668f6f4b..c4accd6c89 100644
--- a/src/backends/neon/workloads/NeonWorkloadUtils.hpp
+++ b/src/backends/neon/workloads/NeonWorkloadUtils.hpp
@@ -5,9 +5,9 @@
#pragma once
#include <backends/Workload.hpp>
-#include <backends/neon/NeonTensorHandle.hpp>
-#include "NeonTimer.hpp"
+#include <backends/neon/NeonTensorHandle.hpp>
+#include <backends/neon/NeonTimer.hpp>
#include <arm_compute/core/Types.h>
#include <arm_compute/core/Helpers.h>
diff --git a/src/backends/test/LayerTests.cpp b/src/backends/test/LayerTests.cpp
index 4b50e4b5f9..d955e42c36 100755
--- a/src/backends/test/LayerTests.cpp
+++ b/src/backends/test/LayerTests.cpp
@@ -16,11 +16,6 @@
#include <backends/CpuTensorHandle.hpp>
#include <backends/WorkloadFactory.hpp>
-#ifdef ARMCOMPUTECL_ENABLED
-#include <backends/cl/ClTensorHandle.hpp>
-#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
-#endif
-
#include <algorithm>
#include <boost/cast.hpp>
diff --git a/src/backends/test/MemCopyTestImpl.hpp b/src/backends/test/MemCopyTestImpl.hpp
new file mode 100644
index 0000000000..dab7f47915
--- /dev/null
+++ b/src/backends/test/MemCopyTestImpl.hpp
@@ -0,0 +1,84 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "LayerTests.hpp"
+#include "TensorCopyUtils.hpp"
+#include "WorkloadTestUtils.hpp"
+
+#include <armnn/test/TensorHelpers.hpp>
+
+#include <boost/multi_array.hpp>
+
+namespace
+{
+
+LayerTestResult<float, 4> MemCopyTest(armnn::IWorkloadFactory& srcWorkloadFactory,
+ armnn::IWorkloadFactory& dstWorkloadFactory,
+ bool withSubtensors)
+{
+ const std::array<unsigned int, 4> shapeData = { { 1u, 1u, 6u, 5u } };
+ const armnn::TensorShape tensorShape(4, shapeData.data());
+ const armnn::TensorInfo tensorInfo(tensorShape, armnn::DataType::Float32);
+ boost::multi_array<float, 4> inputData = MakeTensor<float, 4>(tensorInfo, std::vector<float>(
+ {
+ 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+
+ 6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
+
+ 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
+
+ 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
+
+ 21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
+
+ 26.0f, 27.0f, 28.0f, 29.0f, 30.0f,
+ })
+ );
+
+ LayerTestResult<float, 4> ret(tensorInfo);
+ ret.outputExpected = inputData;
+
+ boost::multi_array<float, 4> outputData(shapeData);
+
+ auto inputTensorHandle = srcWorkloadFactory.CreateTensorHandle(tensorInfo);
+ auto outputTensorHandle = dstWorkloadFactory.CreateTensorHandle(tensorInfo);
+
+ AllocateAndCopyDataToITensorHandle(inputTensorHandle.get(), inputData.data());
+ outputTensorHandle->Allocate();
+
+ armnn::MemCopyQueueDescriptor memCopyQueueDesc;
+ armnn::WorkloadInfo workloadInfo;
+
+ const unsigned int origin[4] = {};
+
+ auto workloadInput = (withSubtensors && srcWorkloadFactory.SupportsSubTensors())
+ ? srcWorkloadFactory.CreateSubTensorHandle(*inputTensorHandle, tensorShape, origin)
+ : std::move(inputTensorHandle);
+ auto workloadOutput = (withSubtensors && dstWorkloadFactory.SupportsSubTensors())
+ ? dstWorkloadFactory.CreateSubTensorHandle(*outputTensorHandle, tensorShape, origin)
+ : std::move(outputTensorHandle);
+
+ AddInputToWorkload(memCopyQueueDesc, workloadInfo, tensorInfo, workloadInput.get());
+ AddOutputToWorkload(memCopyQueueDesc, workloadInfo, tensorInfo, workloadOutput.get());
+
+ dstWorkloadFactory.CreateMemCopy(memCopyQueueDesc, workloadInfo)->Execute();
+
+ CopyDataFromITensorHandle(outputData.data(), workloadOutput.get());
+ ret.output = outputData;
+
+ return ret;
+}
+
+template<typename SrcWorkloadFactory, typename DstWorkloadFactory>
+LayerTestResult<float, 4> MemCopyTest(bool withSubtensors)
+{
+ SrcWorkloadFactory srcWorkloadFactory;
+ DstWorkloadFactory dstWorkloadFactory;
+
+ return MemCopyTest(srcWorkloadFactory, dstWorkloadFactory, withSubtensors);
+}
+
+} // anonymous namespace
diff --git a/src/backends/test/MemCopyTests.cpp b/src/backends/test/MemCopyTests.cpp
deleted file mode 100644
index f66caffd92..0000000000
--- a/src/backends/test/MemCopyTests.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-#include <boost/test/unit_test.hpp>
-#include <boost/multi_array.hpp>
-
-#include <armnn/ArmNN.hpp>
-#include <backends/reference/RefWorkloadFactory.hpp>
-#if ARMCOMPUTECL_ENABLED
-#include <backends/cl/ClWorkloadFactory.hpp>
-#endif
-#if ARMCOMPUTENEON_ENABLED
-#include <backends/neon/NeonWorkloadFactory.hpp>
-#endif
-#include <backends/CpuTensorHandle.hpp>
-#include <test/TensorHelpers.hpp>
-
-#include "TensorCopyUtils.hpp"
-#include "WorkloadTestUtils.hpp"
-
-#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
-#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
-#endif
-
-BOOST_AUTO_TEST_SUITE(MemCopyTestSuite)
-
-void MemCopyTest(armnn::IWorkloadFactory& srcWorkloadFactory, armnn::IWorkloadFactory& dstWorkloadFactory,
- bool withSubtensors)
-{
- const std::array<unsigned int, 4> shapeData = { { 1u, 1u, 6u, 5u } };
- const armnn::TensorShape tensorShape(4, shapeData.data());
- const armnn::TensorInfo tensorInfo(tensorShape, armnn::DataType::Float32);
- boost::multi_array<float, 4> inputData = MakeTensor<float, 4>(tensorInfo, std::vector<float>(
- {
- 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
-
- 6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
-
- 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
-
- 16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
-
- 21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
-
- 26.0f, 27.0f, 28.0f, 29.0f, 30.0f,
- })
- );
-
- boost::multi_array<float, 4> outputData(shapeData);
-
- auto inputTensorHandle = srcWorkloadFactory.CreateTensorHandle(tensorInfo);
- auto outputTensorHandle = dstWorkloadFactory.CreateTensorHandle(tensorInfo);
-
- AllocateAndCopyDataToITensorHandle(inputTensorHandle.get(), inputData.data());
- outputTensorHandle->Allocate();
-
- armnn::MemCopyQueueDescriptor memCopyQueueDesc;
- armnn::WorkloadInfo workloadInfo;
-
- const unsigned int origin[4] = {};
-
- auto workloadInput = (withSubtensors && srcWorkloadFactory.SupportsSubTensors())
- ? srcWorkloadFactory.CreateSubTensorHandle(*inputTensorHandle, tensorShape, origin)
- : std::move(inputTensorHandle);
- auto workloadOutput = (withSubtensors && dstWorkloadFactory.SupportsSubTensors())
- ? dstWorkloadFactory.CreateSubTensorHandle(*outputTensorHandle, tensorShape, origin)
- : std::move(outputTensorHandle);
-
- AddInputToWorkload(memCopyQueueDesc, workloadInfo, tensorInfo, workloadInput.get());
- AddOutputToWorkload(memCopyQueueDesc, workloadInfo, tensorInfo, workloadOutput.get());
-
- dstWorkloadFactory.CreateMemCopy(memCopyQueueDesc, workloadInfo)->Execute();
-
- CopyDataFromITensorHandle(outputData.data(), workloadOutput.get());
-
- BOOST_TEST(CompareTensors(inputData, outputData));
-}
-
-template <typename SrcWorkloadFactory, typename DstWorkloadFactory>
-void MemCopyTest(bool withSubtensors)
-{
- SrcWorkloadFactory srcWorkloadFactory;
- DstWorkloadFactory dstWorkloadFactory;
- MemCopyTest(srcWorkloadFactory, dstWorkloadFactory, withSubtensors);
-}
-
-#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED
-
-BOOST_AUTO_TEST_CASE(AclTypeConversions)
-{
- arm_compute::Strides strides(1,2,3,4);
- armnn::TensorShape convertedStrides = armnn::armcomputetensorutils::GetStrides(strides);
- BOOST_TEST(convertedStrides[0] == 4);
- BOOST_TEST(convertedStrides[1] == 3);
- BOOST_TEST(convertedStrides[2] == 2);
- BOOST_TEST(convertedStrides[3] == 1);
-
- arm_compute::TensorShape shape(5,6,7,8);
- armnn::TensorShape convertedshape = armnn::armcomputetensorutils::GetShape(shape);
- BOOST_TEST(convertedshape[0] == 8);
- BOOST_TEST(convertedshape[1] == 7);
- BOOST_TEST(convertedshape[2] == 6);
- BOOST_TEST(convertedshape[3] == 5);
-}
-#endif
-
-#if ARMCOMPUTECL_ENABLED
-
-BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu)
-{
- MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpu)
-{
- MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpuWithSubtensors)
-{
- MemCopyTest<armnn::RefWorkloadFactory, armnn::ClWorkloadFactory>(true);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndCpuWithSubtensors)
-{
- MemCopyTest<armnn::ClWorkloadFactory, armnn::RefWorkloadFactory>(true);
-}
-
-#endif // ARMCOMPUTECL_ENABLED
-
-#if ARMCOMPUTENEON_ENABLED
-
-BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndNeon)
-{
- MemCopyTest<armnn::RefWorkloadFactory, armnn::NeonWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndCpu)
-{
- MemCopyTest<armnn::NeonWorkloadFactory, armnn::RefWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndNeonWithSubtensors)
-{
- MemCopyTest<armnn::RefWorkloadFactory, armnn::NeonWorkloadFactory>(true);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndCpuWithSubtensors)
-{
- MemCopyTest<armnn::NeonWorkloadFactory, armnn::RefWorkloadFactory>(true);
-}
-
-#endif // ARMCOMPUTENEON_ENABLED
-
-#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED
-
-BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndGpu)
-{
- MemCopyTest<armnn::NeonWorkloadFactory, armnn::ClWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndNeon)
-{
- MemCopyTest<armnn::ClWorkloadFactory, armnn::NeonWorkloadFactory>(false);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenNeonAndGpuWithSubtensors)
-{
- MemCopyTest<armnn::NeonWorkloadFactory, armnn::ClWorkloadFactory>(true);
-}
-
-BOOST_AUTO_TEST_CASE(CopyBetweenGpuAndNeonWithSubtensors)
-{
- MemCopyTest<armnn::ClWorkloadFactory, armnn::NeonWorkloadFactory>(true);
-}
-
-#endif
-
-BOOST_AUTO_TEST_SUITE_END()