author     Anthony Barbier <anthony.barbier@arm.com>  2017-09-04 18:44:23 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-09-17 13:03:09 +0100
commit     6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
tree       a7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /tests/benchmark
download   ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
COMPMID-344 Updated doxygen
Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
Diffstat (limited to 'tests/benchmark')
-rw-r--r--  tests/benchmark/CL/ActivationLayer.cpp  212
-rw-r--r--  tests/benchmark/CL/BitwiseAnd.cpp  133
-rw-r--r--  tests/benchmark/CL/CMakeLists.txt  57
-rw-r--r--  tests/benchmark/CL/ConvolutionLayer.cpp  277
-rw-r--r--  tests/benchmark/CL/FullyConnectedLayer.cpp  116
-rw-r--r--  tests/benchmark/CL/GEMM.cpp  492
-rw-r--r--  tests/benchmark/CL/GEMM.h  102
-rw-r--r--  tests/benchmark/CL/NormalizationLayer.cpp  93
-rw-r--r--  tests/benchmark/CL/PoolingLayer.cpp  141
-rw-r--r--  tests/benchmark/CMakeLists.txt  100
-rw-r--r--  tests/benchmark/Datasets.h  79
-rw-r--r--  tests/benchmark/Instrument.h  107
-rw-r--r--  tests/benchmark/NEON/ActivationLayer.cpp  239
-rw-r--r--  tests/benchmark/NEON/BitwiseAnd.cpp  126
-rw-r--r--  tests/benchmark/NEON/CMakeLists.txt  37
-rw-r--r--  tests/benchmark/NEON/ConvolutionLayer.cpp  303
-rw-r--r--  tests/benchmark/NEON/ConvolutionLayerDirect.cpp  74
-rw-r--r--  tests/benchmark/NEON/FullyConnectedLayer.cpp  132
-rw-r--r--  tests/benchmark/NEON/GEMM.cpp  709
-rw-r--r--  tests/benchmark/NEON/GEMM.h  106
-rw-r--r--  tests/benchmark/NEON/NormalizationLayer.cpp  111
-rw-r--r--  tests/benchmark/NEON/PoolingLayer.cpp  162
-rw-r--r--  tests/benchmark/PMUCounter.cpp  144
-rw-r--r--  tests/benchmark/PMUCounter.h  71
-rw-r--r--  tests/benchmark/PerformanceProgramOptions.cpp  48
-rw-r--r--  tests/benchmark/PerformanceProgramOptions.h  45
-rw-r--r--  tests/benchmark/PerformanceUserConfiguration.cpp  45
-rw-r--r--  tests/benchmark/PerformanceUserConfiguration.h  57
-rw-r--r--  tests/benchmark/Profiler.cpp  87
-rw-r--r--  tests/benchmark/Profiler.h  76
-rw-r--r--  tests/benchmark/WallClockTimer.cpp  56
-rw-r--r--  tests/benchmark/WallClockTimer.h  53
-rw-r--r--  tests/benchmark/common/ActivationLayer.h  92
-rw-r--r--  tests/benchmark/common/ConvolutionLayer.h  107
-rw-r--r--  tests/benchmark/common/FullyConnectedLayer.h  108
-rw-r--r--  tests/benchmark/common/NormalizationLayer.h  96
-rw-r--r--  tests/benchmark/common/PoolingLayer.h  95
-rw-r--r--  tests/benchmark/main.cpp  96
-rw-r--r--  tests/benchmark/system_tests/CL/AlexNet.cpp  87
-rw-r--r--  tests/benchmark/system_tests/CL/LeNet5.cpp  82
-rw-r--r--  tests/benchmark/system_tests/NEON/AlexNet.cpp  120
-rw-r--r--  tests/benchmark/system_tests/NEON/LeNet5.cpp  80
-rw-r--r--  tests/benchmark/system_tests/common/AlexNet.h  95
-rw-r--r--  tests/benchmark/system_tests/common/LeNet5.h  82
44 files changed, 5730 insertions, 0 deletions
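
Every new benchmark source in this commit follows the same Google Benchmark fixture pattern: a fixture sets up the tensors and the function under test, BENCHMARK_DEFINE_F supplies the timed body, and BENCHMARK_REGISTER_F registers one case per dataset entry. The sketch below is a minimal stand-alone illustration of that pattern, not code from this commit; the fixture and case names are placeholders and it assumes only a standard Google Benchmark installation.

#include <benchmark/benchmark.h>

class ExampleLayerFixture : public ::benchmark::Fixture
{
public:
    void SetUp(::benchmark::State &state) override
    {
        // The real fixtures create tensors and configure the function here,
        // selecting the workload from their dataset via state.range(0).
    }

    void TearDown(::benchmark::State &state) override
    {
        // The real fixtures submit the profiler's measurements to 'state' here.
    }
};

BENCHMARK_DEFINE_F(ExampleLayerFixture, example_case)
(::benchmark::State &state)
{
    while(state.KeepRunning())
    {
        // Run the function under test and wait for completion; the CL
        // benchmarks call CLScheduler::get().sync() before stopping the timer.
    }
}

// One registration per dataset entry, mirroring the repeated
// BENCHMARK_REGISTER_F blocks in the files below.
BENCHMARK_REGISTER_F(ExampleLayerFixture, example_case)
->Threads(1)
->Arg(0);

BENCHMARK_MAIN();

Once built, a subset of the registered cases can be run with Google Benchmark's standard filter option, for example --benchmark_filter=cl_alexnet.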
diff --git a/tests/benchmark/CL/ActivationLayer.cpp b/tests/benchmark/CL/ActivationLayer.cpp
new file mode 100644
index 0000000000..5180d3d900
--- /dev/null
+++ b/tests/benchmark/CL/ActivationLayer.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNet = ActivationLayer<AlexNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerLeNet5 = ActivationLayer<LeNet5ActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ActivationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
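
Each BENCHMARK_REGISTER_F call above registers a single dataset entry: the first template argument of DataSetArgBatched names the dataset, the second selects the entry, and the remaining values are batch sizes, all forwarded to the fixture through state.range(). The real helper lives in tests/benchmark/Datasets.h, which is not part of this excerpt, so the sketch below is only an assumed shape, shown to make the registration lines easier to read.

#include <benchmark/benchmark.h>

// Hypothetical Apply helper in the spirit of DataSetArgBatched (assumption,
// not the implementation from tests/benchmark/Datasets.h). DataSet is unused
// here; the real helper presumably uses it for naming or bounds checking.
template <typename DataSet, int Index, int... BatchSizes>
void DataSetArgBatchedSketch(::benchmark::internal::Benchmark *b)
{
    // Register one argument tuple per batch size; the fixture reads the
    // dataset index back via state.range(0) and the batch via state.range(1).
    for(const int batch : { BatchSizes... })
    {
        b->Args({ Index, batch });
    }
}

// Usage would mirror the registrations above, e.g.
// ->Apply(DataSetArgBatchedSketch<AlexNetActivationLayerDataset, 0, 1, 4, 8>);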
diff --git a/tests/benchmark/CL/BitwiseAnd.cpp b/tests/benchmark/CL/BitwiseAnd.cpp
new file mode 100644
index 0000000000..a3deb3eb5b
--- /dev/null
+++ b/tests/benchmark/CL/BitwiseAnd.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+namespace
+{
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ ::benchmark::Fixture::SetUp(state);
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const std::string image_name = *(DataSet().begin() + state.range(0));
+
+ // Create tensors
+ src1 = create_tensor(image_name, DataType::U8);
+ src2 = create_tensor(image_name, DataType::U8);
+ dst = create_tensor(image_name, DataType::U8);
+
+ // Create and configure function
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill source tensors
+ library->fill(CLAccessor(src1), image_name, Channel::R);
+ library->fill(CLAccessor(src2), image_name, Channel::G);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+
+ ::benchmark::Fixture::TearDown(state);
+ }
+
+ CLBitwiseAnd band{};
+ Profiler profiler{};
+
+private:
+ CLTensor src1{};
+ CLTensor src2{};
+ CLTensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, cl_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, cl_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/CL/CMakeLists.txt b/tests/benchmark/CL/CMakeLists.txt
new file mode 100644
index 0000000000..8493309f40
--- /dev/null
+++ b/tests/benchmark/CL/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+include_directories(${CMAKE_SOURCE_DIR}/../include)
+
+set(arm_compute_test_benchmark_TARGET_DEFINITIONS
+ ${arm_compute_test_benchmark_TARGET_DEFINITIONS}
+ -DOPENCL
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_INCLUDES
+ ${arm_compute_test_benchmark_TARGET_INCLUDES}
+ ${CMAKE_SOURCE_DIR}/../include
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_OPENCL_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/CL/CLAccessor.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Bitwise/BitwiseAnd.cpp
+)
+
+add_library(arm_compute_test_benchmark_OPENCL OBJECT
+ ${arm_compute_test_benchmark_OPENCL_SOURCE_FILES}
+)
+
+set(arm_compute_test_benchmark_TARGET_OBJECTS
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_benchmark_OPENCL>
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_LIBRARIES
+ ${arm_compute_test_benchmark_TARGET_LIBRARIES}
+ OpenCL
+ PARENT_SCOPE
+)
diff --git a/tests/benchmark/CL/ConvolutionLayer.cpp b/tests/benchmark/CL/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..e1f4fabdc3
--- /dev/null
+++ b/tests/benchmark/CL/ConvolutionLayer.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerLeNet5 = ConvolutionLayer<LeNet5ConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, CLTensor, CLAccessor, CLConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/CL/FullyConnectedLayer.cpp b/tests/benchmark/CL/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..6e8c89fa0b
--- /dev/null
+++ b/tests/benchmark/CL/FullyConnectedLayer.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNet = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerLeNet5 = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/CL/GEMM.cpp b/tests/benchmark/CL/GEMM.cpp
new file mode 100644
index 0000000000..b90556df48
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.cpp
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/CL/GEMM.h"
+
+namespace
+{
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+} // namespace
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/CL/GEMM.h b/tests/benchmark/CL/GEMM.h
new file mode 100644
index 0000000000..02a339609c
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+// FIXME: Merge with NEON/GEMM.h into common/GEMM.h after adding F16 support to NEON GEMM and QS8 support to CL GEMM
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32, "Unsupported data type for GEMM operation");
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+ TensorShape shape_a = gemm_obj.shape_a;
+ TensorShape shape_b = gemm_obj.shape_b;
+ TensorShape shape_c = gemm_obj.shape_c;
+ TensorShape shape_d = gemm_obj.shape_d;
+
+ // Create tensors
+ a = create_tensor(shape_a, data_type);
+ b = create_tensor(shape_b, data_type);
+ c = create_tensor(shape_c, data_type);
+ d = create_tensor(shape_d, data_type);
+
+ // Create and configure function
+ gemm_layer = std::unique_ptr<Function>(new Function());
+ gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+ // Allocate tensors
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ c.allocator()->allocate();
+ d.allocator()->allocate();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ gemm_layer.reset();
+
+ a.allocator()->free();
+ b.allocator()->free();
+ c.allocator()->free();
+ d.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> gemm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType a{};
+ TensorType b{};
+ TensorType c{};
+ TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
diff --git a/tests/benchmark/CL/NormalizationLayer.cpp b/tests/benchmark/CL/NormalizationLayer.cpp
new file mode 100644
index 0000000000..81d3c65912
--- /dev/null
+++ b/tests/benchmark/CL/NormalizationLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNet = NormalizationLayer<AlexNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+
+} // namespace
+
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/CL/PoolingLayer.cpp b/tests/benchmark/CL/PoolingLayer.cpp
new file mode 100644
index 0000000000..5285f279e7
--- /dev/null
+++ b/tests/benchmark/CL/PoolingLayer.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNet = PoolingLayer<AlexNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerLeNet5 = PoolingLayer<LeNet5PoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(PoolingLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000..115333a1b0
--- /dev/null
+++ b/tests/benchmark/CMakeLists.txt
@@ -0,0 +1,100 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+add_library(benchmark STATIC IMPORTED)
+set_target_properties(benchmark PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libbenchmark.a"
+)
+
+add_library(OpenCL SHARED IMPORTED)
+set_target_properties(OpenCL PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../build/opencl-1.2-stubs/libOpenCL.so"
+ IMPORTED_NO_SONAME 1
+)
+
+option(ENABLE_PMU_COUNTER "Compile with PMU counter support")
+
+set(ARM_COMPUTE_TARGETS_TO_MEASURE "all" CACHE STRING "Semicolon-separated list of targets to include in the benchmarks.")
+
+set(ARM_COMPUTE_ALL_TARGETS
+ NEON
+ CL
+)
+
+if(ARM_COMPUTE_TARGETS_TO_MEASURE STREQUAL "all")
+ set(ARM_COMPUTE_TARGETS_TO_MEASURE ${ARM_COMPUTE_ALL_TARGETS})
+endif()
+
+list(REMOVE_DUPLICATES ARM_COMPUTE_TARGETS_TO_MEASURE)
+
+foreach(TARGET ${ARM_COMPUTE_TARGETS_TO_MEASURE})
+ list(FIND ARM_COMPUTE_ALL_TARGETS ${TARGET} idx)
+
+ if(${idx} LESS 0)
+ message(FATAL_ERROR "The target '${TARGET}' does not exist. It should be one of\n${ARM_COMPUTE_ALL_TARGETS}")
+ else()
+ add_subdirectory(${TARGET})
+ endif()
+endforeach()
+
+set(arm_compute_test_benchmark_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Datasets.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Instrument.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Profiler.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Profiler.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceProgramOptions.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceProgramOptions.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceUserConfiguration.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceUserConfiguration.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/WallClockTimer.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/WallClockTimer.cpp
+)
+
+if(${ENABLE_PMU_COUNTER})
+ list(APPEND arm_compute_test_benchmark_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/PMUCounter.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PMUCounter.cpp
+ )
+endif()
+
+add_library(arm_compute_test_benchmark OBJECT
+ ${arm_compute_test_benchmark_SOURCE_FILES}
+)
+
+add_definitions(${arm_compute_test_benchmark_TARGET_DEFINITIONS})
+include_directories(${arm_compute_test_benchmark_TARGET_INCLUDES})
+
+add_executable(arm_compute_benchmark
+ $<TARGET_OBJECTS:arm_compute_test_benchmark>
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:tensor_library>
+ $<TARGET_OBJECTS:arm_compute_test>
+)
+
+target_link_libraries(arm_compute_benchmark
+ benchmark
+ boost_program_options
+ arm_compute
+ ${arm_compute_test_benchmark_TARGET_LIBRARIES}
+)
diff --git a/tests/benchmark/Datasets.h b/tests/benchmark/Datasets.h
new file mode 100644
index 0000000000..e7bfb6f10f
--- /dev/null
+++ b/tests/benchmark/Datasets.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+
+#include "dataset/ActivationLayerDataset.h"
+#include "dataset/BorderModeDataset.h"
+#include "dataset/ConvolutionLayerDataset.h"
+#include "dataset/DataTypeDatasets.h"
+#include "dataset/FullyConnectedLayerDataset.h"
+#include "dataset/GEMMDataset.h"
+#include "dataset/ImageDatasets.h"
+#include "dataset/InterpolationPolicyDataset.h"
+#include "dataset/NormalizationLayerDataset.h"
+#include "dataset/PoolingLayerDataset.h"
+#include "dataset/ShapeDatasets.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, int N>
+void DataSetArg(::benchmark::internal::Benchmark *b)
+{
+ b->Arg(N);
+ b->ArgName(std::string(*(DataSet().begin() + N)));
+}
+
+template <typename DataSet, int N, unsigned int... Args>
+void DataSetArgBatched(::benchmark::internal::Benchmark *b)
+{
+ constexpr std::array<unsigned int, sizeof...(Args)> batches{ { Args... } };
+ for(const auto &el : batches)
+ {
+ b->Args({ N, static_cast<int>(el) });
+ }
+ b->ArgNames({ std::string(*(DataSet().begin() + N)), "batch_size" });
+}
+
+template <typename DataSet>
+void DataSetArgs(::benchmark::internal::Benchmark *b)
+{
+ for(size_t i = 0; i < DataSet().size(); ++i)
+ {
+ b->Arg(i);
+ b->ArgName(*(DataSet().begin() + i));
+ }
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
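As a hedged illustration of what DataSetArgBatched expands to when passed to ->Apply(): for entry N of a dataset it registers one run per listed batch size and labels the two arguments with the entry's name and "batch_size". The hand-written equivalent below uses AlexNetPoolingLayerDataset entry 2 purely as an example:

    // Hand-written equivalent of DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>
    // (illustrative only).
    void AlexNetPoolingEntry2Batched(::benchmark::internal::Benchmark *b)
    {
        b->Args({ 2, 1 }); // dataset entry 2, batch size 1
        b->Args({ 2, 4 }); // dataset entry 2, batch size 4
        b->Args({ 2, 8 }); // dataset entry 2, batch size 8
        b->ArgNames({ std::string(*(AlexNetPoolingLayerDataset().begin() + 2)), "batch_size" });
    }
    // Used as: BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)->Threads(1)->Apply(AlexNetPoolingEntry2Batched);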
diff --git a/tests/benchmark/Instrument.h b/tests/benchmark/Instrument.h
new file mode 100644
index 0000000000..39b0088670
--- /dev/null
+++ b/tests/benchmark/Instrument.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+
+#include "Utils.h"
+
+#include <memory>
+#include <string>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Interface for classes that can be used to measure performance. */
+class Instrument
+{
+public:
+ /** Interface defining a measurement, e.g. time, cycles, ... */
+ class IMeasurement
+ {
+ public:
+ IMeasurement() = default;
+ IMeasurement(const IMeasurement &) = default;
+ IMeasurement(IMeasurement &&) = default;
+ IMeasurement &operator=(const IMeasurement &) = default;
+ IMeasurement &operator=(IMeasurement &&) = default;
+ virtual ~IMeasurement() = default;
+
+ virtual operator double() const = 0;
+ };
+
+ /** Implementation of a Measurement class for arithmetic types. */
+ template <typename T>
+ class Measurement : public IMeasurement
+ {
+ public:
+ /** Store the given value as measurement.
+ *
+ * @param[in] value Measured value.
+ */
+ Measurement(T value);
+
+ operator double() const override;
+
+ private:
+ T _value;
+ };
+
+ Instrument() = default;
+ Instrument(const Instrument &) = default;
+ Instrument(Instrument &&) = default;
+ Instrument &operator=(const Instrument &) = default;
+ Instrument &operator=(Instrument &&) = default;
+ virtual ~Instrument() = default;
+
+ /** Identifier for the instrument */
+ virtual std::string id() const = 0;
+
+ /** Start measuring. */
+ virtual void start() = 0;
+
+ /** Stop measuring. */
+ virtual void stop() = 0;
+
+ /** Return the latest measurement. */
+ virtual std::unique_ptr<IMeasurement> get_measurement() const = 0;
+};
+
+template <typename T>
+Instrument::Measurement<T>::Measurement(T value)
+ : _value{ value }
+{
+}
+
+template <typename T>
+Instrument::Measurement<T>::operator double() const
+{
+ return _value;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
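For illustration, a hypothetical instrument implementing the interface above. This is only a sketch based on std::chrono, written as if inside the arm_compute::test::benchmark namespace; it is not the WallClockTimer or PMUCounter shipped by this patch:

    #include <chrono>

    // Sketch of an Instrument that reports elapsed wall-clock time in milliseconds.
    class SteadyClockTimer : public Instrument
    {
    public:
        std::string id() const override
        {
            return "Steady clock timer";
        }

        void start() override
        {
            _start = std::chrono::steady_clock::now();
        }

        void stop() override
        {
            _stop = std::chrono::steady_clock::now();
        }

        std::unique_ptr<IMeasurement> get_measurement() const override
        {
            // Convert the measured interval to milliseconds and wrap it in a Measurement
            const std::chrono::duration<double, std::milli> delta = _stop - _start;
            return std::unique_ptr<IMeasurement>(new Measurement<double>(delta.count()));
        }

    private:
        std::chrono::steady_clock::time_point _start{};
        std::chrono::steady_clock::time_point _stop{};
    };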
diff --git a/tests/benchmark/NEON/ActivationLayer.cpp b/tests/benchmark/NEON/ActivationLayer.cpp
new file mode 100644
index 0000000000..8faed9f831
--- /dev/null
+++ b/tests/benchmark/NEON/ActivationLayer.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNetF32 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer>;
+using ActivationLayerAlexNetQS8 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::QS8>;
+using ActivationLayerLeNet5 = ActivationLayer<LeNet5ActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/BitwiseAnd.cpp b/tests/benchmark/NEON/BitwiseAnd.cpp
new file mode 100644
index 0000000000..dba3d1ebea
--- /dev/null
+++ b/tests/benchmark/NEON/BitwiseAnd.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+namespace
+{
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const std::string image_name = *(DataSet().begin() + state.range(0));
+
+ // Create tensors
+ src1 = create_tensor(image_name, DataType::U8);
+ src2 = create_tensor(image_name, DataType::U8);
+ dst = create_tensor(image_name, DataType::U8);
+
+ // Create and configure function
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill source tensors
+ library->fill(NEAccessor(src1), image_name, Channel::R);
+ library->fill(NEAccessor(src2), image_name, Channel::G);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ }
+
+ NEBitwiseAnd band{};
+ Profiler profiler{};
+
+private:
+ Tensor src1{};
+ Tensor src2{};
+ Tensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, neon_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, neon_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/NEON/CMakeLists.txt b/tests/benchmark/NEON/CMakeLists.txt
new file mode 100644
index 0000000000..2cb3eb36c9
--- /dev/null
+++ b/tests/benchmark/NEON/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+set(arm_compute_test_benchmark_NEON_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/NEON/NEAccessor.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
+)
+
+add_library(arm_compute_test_benchmark_NEON OBJECT
+ ${arm_compute_test_benchmark_NEON_SOURCE_FILES}
+)
+
+set(arm_compute_test_benchmark_TARGET_OBJECTS
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_benchmark_NEON>
+ PARENT_SCOPE
+)
diff --git a/tests/benchmark/NEON/ConvolutionLayer.cpp b/tests/benchmark/NEON/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..0cfff8494b
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayer.cpp
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNetF32 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerAlexNetQS8 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer, DataType::QS8>;
+using ConvolutionLayerLeNet5 = ConvolutionLayer<LeNet5ConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, Tensor, NEAccessor, NEConvolutionLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/ConvolutionLayerDirect.cpp b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
new file mode 100644
index 0000000000..bc56e844d8
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerDirectAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEDirectConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+// Register only the 3x3 convolution layers
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/FullyConnectedLayer.cpp b/tests/benchmark/NEON/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..85979203ac
--- /dev/null
+++ b/tests/benchmark/NEON/FullyConnectedLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNetF32 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerAlexNetQS8 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer, DataType::QS8>;
+using FullyConnectedLayerLeNet5 = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/GEMM.cpp b/tests/benchmark/NEON/GEMM.cpp
new file mode 100644
index 0000000000..9190309f1c
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.cpp
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/NEON/GEMM.h"
+
+namespace
+{
+#ifdef ENABLE_FP16
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+#endif /* ENABLE_FP16 */
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMQS8GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+using GEMMQS8GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+} // namespace
+#ifdef ENABLE_FP16
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+#endif /* ENABLE_FP16 */
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/NEON/GEMM.h b/tests/benchmark/NEON/GEMM.h
new file mode 100644
index 0000000000..24d196523f
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+// FIXME: Merge with CL/GEMM.h into common/GEMM.h after adding F16 support to NEON GEMM and QS8 support to CL GEMM
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+#ifdef ENABLE_FP16
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#else /* ENABLE_FP16 */
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#endif /* ENABLE_FP16 */
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+ TensorShape shape_a = gemm_obj.shape_a;
+ TensorShape shape_b = gemm_obj.shape_b;
+ TensorShape shape_c = gemm_obj.shape_c;
+ TensorShape shape_d = gemm_obj.shape_d;
+
+ // Create tensors
+ a = create_tensor(shape_a, data_type, 1, 4);
+ b = create_tensor(shape_b, data_type, 1, 4);
+ c = create_tensor(shape_c, data_type, 1, 4);
+ d = create_tensor(shape_d, data_type, 1, 4);
+
+ // Create and configure function
+ gemm_layer = std::unique_ptr<Function>(new Function());
+ gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+ // Allocate tensors
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ c.allocator()->allocate();
+ d.allocator()->allocate();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ gemm_layer.reset();
+
+ a.allocator()->free();
+ b.allocator()->free();
+ c.allocator()->free();
+ d.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> gemm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType a{};
+ TensorType b{};
+ TensorType c{};
+ TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
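For reference, this fixture is consumed exactly as in GEMM.cpp above: instantiate the template for a dataset and data type, define the benchmark body, then register one case per dataset entry. The sketch below only restates that pattern; MyGEMMDataset and the chosen entry index are hypothetical placeholders, not part of this patch.

// Hypothetical dataset; any type with the GoogLeNetGEMMDataset interface works.
using GEMMFP32Custom = GEMM<MyGEMMDataset, Tensor, NEAccessor, NEGEMM, DataType::F32>;

BENCHMARK_DEFINE_F(GEMMFP32Custom, neon_custom)
(::benchmark::State &state)
{
    while(state.KeepRunning())
    {
        // Only the run() call is measured by the instruments added in SetUp()
        profiler.start();
        gemm_layer->run();
        profiler.stop();
    }
}

// Register one benchmark per dataset entry (index 0 shown here)
BENCHMARK_REGISTER_F(GEMMFP32Custom, neon_custom)
->Threads(1)
->Apply(DataSetArg<MyGEMMDataset, 0>);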
diff --git a/tests/benchmark/NEON/NormalizationLayer.cpp b/tests/benchmark/NEON/NormalizationLayer.cpp
new file mode 100644
index 0000000000..46dc56b84d
--- /dev/null
+++ b/tests/benchmark/NEON/NormalizationLayer.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNetF32 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+using NormalizationLayerAlexNetQS8 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer, DataType::QS8>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/PoolingLayer.cpp b/tests/benchmark/NEON/PoolingLayer.cpp
new file mode 100644
index 0000000000..9b071317b4
--- /dev/null
+++ b/tests/benchmark/NEON/PoolingLayer.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNetF32 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerAlexNetQS8 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer, DataType::QS8>;
+using PoolingLayerLeNet5 = PoolingLayer<LeNet5PoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+// FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/PMUCounter.cpp b/tests/benchmark/PMUCounter.cpp
new file mode 100644
index 0000000000..e87dae82e6
--- /dev/null
+++ b/tests/benchmark/PMUCounter.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PMUCounter.h"
+
+#include "Utils.h"
+
+#define _GNU_SOURCE 1
+#include <asm/unistd.h>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <stdexcept>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+CycleCounter::CycleCounter()
+{
+ const pid_t pid = getpid();
+
+ struct perf_event_attr perf_config
+ {
+ };
+ memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+ perf_config.config = PERF_COUNT_HW_CPU_CYCLES;
+ perf_config.size = sizeof(struct perf_event_attr);
+ perf_config.type = PERF_TYPE_HARDWARE;
+ // The inherit bit specifies that this counter should count events of child
+ // tasks as well as the task specified
+ perf_config.inherit = 1;
+ // Enables saving of event counts on context switch for inherited tasks
+ perf_config.inherit_stat = 1;
+
+ _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+ if(_fd < 0)
+ {
+ throw std::runtime_error("perf_event_open for cycles failed");
+ }
+}
+
+std::string CycleCounter::id() const
+{
+ return "Cycle Counter";
+}
+
+void CycleCounter::start()
+{
+ ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void CycleCounter::stop()
+{
+ ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(_fd, &_cycles, sizeof(_cycles));
+}
+
+std::unique_ptr<Instrument::IMeasurement> CycleCounter::get_measurement() const
+{
+ return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<long long>>(_cycles);
+}
+
+InstructionCounter::InstructionCounter()
+{
+ const pid_t pid = getpid();
+
+ struct perf_event_attr perf_config
+ {
+ };
+ memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+ perf_config.config = PERF_COUNT_HW_INSTRUCTIONS;
+ perf_config.size = sizeof(struct perf_event_attr);
+ perf_config.type = PERF_TYPE_HARDWARE;
+ // The inherit bit specifies that this counter should count events of child
+ // tasks as well as the task specified
+ perf_config.inherit = 1;
+ // Enables saving of event counts on context switch for inherited tasks
+ perf_config.inherit_stat = 1;
+
+ _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+ if(_fd < 0)
+ {
+ throw std::runtime_error("perf_event_open for instructions failed");
+ }
+}
+
+std::string InstructionCounter::id() const
+{
+ return "Instruction Counter";
+}
+
+void InstructionCounter::start()
+{
+ ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void InstructionCounter::stop()
+{
+ ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(_fd, &_instructions, sizeof(_instructions));
+}
+
+std::unique_ptr<Instrument::IMeasurement> InstructionCounter::get_measurement() const
+{
+ return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<long long>>(_instructions);
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
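These counters can also be driven outside the Google Benchmark fixtures, through the Profiler added later in this patch. A minimal sketch, assuming a Linux target where perf_event_open is permitted and that Instrument::IMeasurement converts to double (which Profiler::stop() relies on); run_workload() is a hypothetical function standing in for the code under test.

#include "PMUCounter.h"
#include "Profiler.h"

#include <iostream>
#include <memory>

using namespace arm_compute::test::benchmark;

void run_workload(); // hypothetical code under test (declaration only)

void profile_workload()
{
    Profiler profiler;
    profiler.add(std::make_shared<CycleCounter>());
    profiler.add(std::make_shared<InstructionCounter>());

    // Each start()/stop() pair appends one value per instrument
    profiler.start();
    run_workload();
    profiler.stop();

    // measurements() maps instrument ids ("Cycle Counter", "Instruction Counter")
    // to the values collected so far
    for(const auto &entry : profiler.measurements())
    {
        std::cout << entry.first << ": " << entry.second.back() << std::endl;
    }
}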
diff --git a/tests/benchmark/PMUCounter.h b/tests/benchmark/PMUCounter.h
new file mode 100644
index 0000000000..de45f319f6
--- /dev/null
+++ b/tests/benchmark/PMUCounter.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+
+#include "Instrument.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to count CPU cycles. */
+class CycleCounter : public Instrument
+{
+public:
+ /** Initialise the cycle counter. */
+ CycleCounter();
+
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ long _fd{ -1 };
+ long long _cycles{ 0 };
+};
+
+/** Implementation of an instrument to count executed CPU instructions. */
+class InstructionCounter : public Instrument
+{
+public:
+ /** Initialise the instruction counter. */
+ InstructionCounter();
+
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ long _fd{ -1 };
+ long long _instructions{ 0 };
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
diff --git a/tests/benchmark/PerformanceProgramOptions.cpp b/tests/benchmark/PerformanceProgramOptions.cpp
new file mode 100644
index 0000000000..b4becc3c69
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceProgramOptions.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#include "boost/program_options.hpp"
+#pragma GCC diagnostic pop
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceProgramOptions::PerformanceProgramOptions()
+{
+ boost::program_options::options_description options("Performance options");
+ options.add_options()("runs", boost::program_options::value<unsigned int>()->default_value(1), "Repetitions per test");
+ options.add_options()("threads", boost::program_options::value<unsigned int>()->default_value(1), "Number of parallel CPU threads");
+ add_options(options);
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
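Further options would follow the same boost::program_options pattern inside this constructor; the "seed" option below is purely illustrative and not part of this patch. On the command line the options above are then passed as, for example, --runs=10 --threads=4, both falling back to their default of 1 when omitted.

// Hypothetical additional option, added alongside "runs" and "threads" above
options.add_options()("seed", boost::program_options::value<unsigned int>()->default_value(0), "Hypothetical seed for the tensor library");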
diff --git a/tests/benchmark/PerformanceProgramOptions.h b/tests/benchmark/PerformanceProgramOptions.h
new file mode 100644
index 0000000000..671e263bb2
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+/** Subclass of @ref ProgramOptions that adds performance-specific options. */
+class PerformanceProgramOptions : public ProgramOptions
+{
+public:
+ /** Defines additional options. */
+ PerformanceProgramOptions();
+};
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
diff --git a/tests/benchmark/PerformanceUserConfiguration.cpp b/tests/benchmark/PerformanceUserConfiguration.cpp
new file mode 100644
index 0000000000..ca412d660a
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceUserConfiguration.h"
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceUserConfiguration::PerformanceUserConfiguration(const ProgramOptions &options)
+ : UserConfiguration(options)
+{
+ unsigned int tmp_runs = 0;
+ if(options.get("runs", tmp_runs))
+ {
+ runs = tmp_runs;
+ }
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/PerformanceUserConfiguration.h b/tests/benchmark/PerformanceUserConfiguration.h
new file mode 100644
index 0000000000..a140d404c8
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+
+#include "UserConfiguration.h"
+
+namespace arm_compute
+{
+namespace test
+{
+class ProgramOptions;
+
+namespace performance
+{
+/** Specialisation of @ref UserConfiguration to provide performance-specific
+ * configuration options.
+ */
+struct PerformanceUserConfiguration : public UserConfiguration
+{
+ PerformanceUserConfiguration() = default;
+
+ /** Initialise the configuration according to the program options.
+ *
+ * @param[in] options Parsed command line options.
+ */
+ PerformanceUserConfiguration(const ProgramOptions &options);
+
+ Option<unsigned int> runs{};
+};
+} // namespace performance
+
+extern performance::PerformanceUserConfiguration user_config;
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
diff --git a/tests/benchmark/Profiler.cpp b/tests/benchmark/Profiler.cpp
new file mode 100644
index 0000000000..f3ce94164f
--- /dev/null
+++ b/tests/benchmark/Profiler.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Profiler.h"
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+void Profiler::add(const std::shared_ptr<Instrument> &instrument)
+{
+ _instruments.push_back(instrument);
+}
+
+void Profiler::start()
+{
+ for(auto &instrument : _instruments)
+ {
+ instrument->start();
+ }
+}
+
+void Profiler::stop()
+{
+ for(auto &instrument : _instruments)
+ {
+ instrument->stop();
+ }
+
+ for(const auto &instrument : _instruments)
+ {
+ _measurements[instrument->id()].push_back(*instrument->get_measurement());
+ }
+}
+
+void Profiler::submit(::benchmark::State &state)
+{
+ for(auto &instrument : _measurements)
+ {
+ double sum_values = std::accumulate(instrument.second.begin(), instrument.second.end(), 0.);
+ size_t num_values = instrument.second.size();
+
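+ // When more than two samples exist, report the min and max separately and exclude them from the mean (trimmed mean)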
+ if(num_values > 2)
+ {
+ auto minmax_values = std::minmax_element(instrument.second.begin(), instrument.second.end());
+ state.counters[instrument.first + "_min"] = *minmax_values.first;
+ state.counters[instrument.first + "_max"] = *minmax_values.second;
+ sum_values -= *minmax_values.first + *minmax_values.second;
+ num_values -= 2;
+ }
+ state.counters[instrument.first] = sum_values / num_values;
+ instrument.second.clear();
+ }
+}
+
+const Profiler::MeasurementsMap &Profiler::measurements() const
+{
+ return _measurements;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/Profiler.h b/tests/benchmark/Profiler.h
new file mode 100644
index 0000000000..03922f4704
--- /dev/null
+++ b/tests/benchmark/Profiler.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+
+#include "Instrument.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
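+/** Collects measurements from a set of @ref Instrument objects and reports
+ * them to the active Google Benchmark test.
+ *
+ * Minimal usage sketch (assumes `state` is the ::benchmark::State of the
+ * enclosing benchmark and that the workload call is a placeholder):
+ *
+ * @code
+ * Profiler profiler;
+ * profiler.add(std::make_shared<WallClockTimer>());
+ *
+ * while(state.KeepRunning())
+ * {
+ *     profiler.start();
+ *     run_workload(); // hypothetical function under test
+ *     profiler.stop();
+ * }
+ *
+ * profiler.submit(state);
+ * @endcode
+ */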
+class Profiler
+{
+public:
+ /** Mapping from instrument ids to their measurements. */
+ using MeasurementsMap = std::map<std::string, std::vector<double>>;
+
+ /** Add @p instrument to the performance monitor.
+ *
+ * All added instruments are used to take measurements whenever @ref start
+ * and @ref stop are called.
+ *
+ * @param[in] instrument Instrument to be used to measure performance.
+ */
+ void add(const std::shared_ptr<Instrument> &instrument);
+
+ /** Start all added instruments to measure performance. */
+ void start();
+
+ /** Stop all added instruments. */
+ void stop();
+
+ /** Commit all measured values to the current active test. */
+ void submit(::benchmark::State &state);
+
+ /** Return measurements for all instruments. */
+ const MeasurementsMap &measurements() const;
+
+private:
+ std::vector<std::shared_ptr<Instrument>> _instruments{};
+ MeasurementsMap _measurements{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
diff --git a/tests/benchmark/WallClockTimer.cpp b/tests/benchmark/WallClockTimer.cpp
new file mode 100644
index 0000000000..9ab53d0b3c
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WallClockTimer.h"
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+std::string WallClockTimer::id() const
+{
+ return "Wall clock";
+}
+
+void WallClockTimer::start()
+{
+ _start = std::chrono::high_resolution_clock::now();
+}
+
+void WallClockTimer::stop()
+{
+ _stop = std::chrono::high_resolution_clock::now();
+}
+
+std::unique_ptr<Instrument::IMeasurement> WallClockTimer::get_measurement() const
+{
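+ // Convert the captured start/stop time points into a floating-point millisecond duration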
+ const std::chrono::duration<float, std::milli> delta = _stop - _start;
+ return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<float>>(delta.count());
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/WallClockTimer.h b/tests/benchmark/WallClockTimer.h
new file mode 100644
index 0000000000..cf6828e88d
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+
+#include "Instrument.h"
+
+#include <chrono>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to measure elapsed wall-clock time in milliseconds. */
+class WallClockTimer : public Instrument
+{
+public:
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ std::chrono::high_resolution_clock::time_point _start{};
+ std::chrono::high_resolution_clock::time_point _stop{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
diff --git a/tests/benchmark/common/ActivationLayer.h b/tests/benchmark/common/ActivationLayer.h
new file mode 100644
index 0000000000..7edfb6ef3c
--- /dev/null
+++ b/tests/benchmark/common/ActivationLayer.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ActivationLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
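+/** Generic benchmark fixture for activation layer functions.
+ *
+ * For the dataset entry selected via state.range(0) and the batch size given
+ * by state.range(1) it creates, configures, allocates and fills the source
+ * and destination tensors.
+ *
+ * A registration sketch, following the pattern used by the benchmark files in
+ * this directory (SomeDataSet is a placeholder type; the tensor, accessor and
+ * function types are the NEON ones used elsewhere in these tests):
+ *
+ * @code
+ * using SomeActivationBenchmark = ActivationLayer<SomeDataSet, Tensor, NEAccessor, NEActivationLayer>;
+ *
+ * BENCHMARK_DEFINE_F(SomeActivationBenchmark, neon_some_activation)
+ * (::benchmark::State &state)
+ * {
+ *     while(state.KeepRunning())
+ *     {
+ *         profiler.start();
+ *         act_layer.run();
+ *         profiler.stop();
+ *     }
+ * }
+ *
+ * // Args: dataset entry index, batch size
+ * BENCHMARK_REGISTER_F(SomeActivationBenchmark, neon_some_activation)->Args({0, 1});
+ * @endcode
+ */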
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ActivationLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const ActivationLayerDataObject act_obj = *(DataSet().begin() + state.range(0));
+
+ // Set batch size in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape shape = act_obj.shape;
+ shape.set(shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(shape, dt, 1, fixed_point_position);
+ dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ act_layer.configure(&src, &dst, act_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ Function act_layer{};
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
diff --git a/tests/benchmark/common/ConvolutionLayer.h b/tests/benchmark/common/ConvolutionLayer.h
new file mode 100644
index 0000000000..594c62c50e
--- /dev/null
+++ b/tests/benchmark/common/ConvolutionLayer.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
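+/** Benchmark fixture for convolution layer functions.
+ *
+ * For the dataset entry selected via state.range(0) and the batch size given
+ * by state.range(1) it creates, configures, allocates and fills the source,
+ * weights, bias and destination tensors.
+ */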
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ConvolutionLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const ConvolutionLayerDataObject conv_obj = *(DataSet().begin() + state.range(0));
+
+ // Set batch size in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = conv_obj.src_shape;
+ TensorShape dst_shape = conv_obj.dst_shape;
+ src_shape.set(3 /* batch */, batches);
+ dst_shape.set(3 /* batch */, batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ weights = create_tensor(conv_obj.weights_shape, dt, 1, fixed_point_position);
+ bias = create_tensor(conv_obj.bias_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ conv_layer = std::unique_ptr<Function>(new Function());
+ conv_layer->configure(&src, &weights, &bias, &dst, conv_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ library->fill_tensor_uniform(Accessor(weights), 1);
+ library->fill_tensor_uniform(Accessor(bias), 2);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ conv_layer.reset();
+
+ src.allocator()->free();
+ weights.allocator()->free();
+ bias.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> conv_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType weights{};
+ TensorType bias{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
diff --git a/tests/benchmark/common/FullyConnectedLayer.h b/tests/benchmark/common/FullyConnectedLayer.h
new file mode 100644
index 0000000000..88adf83f2a
--- /dev/null
+++ b/tests/benchmark/common/FullyConnectedLayer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
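+/** Benchmark fixture for fully connected layer functions.
+ *
+ * For the dataset entry selected via state.range(0) and the batch size given
+ * by state.range(1) it creates, configures, allocates and fills the source,
+ * weights, bias and destination tensors.
+ */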
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class FullyConnectedLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const FullyConnectedLayerDataObject fc_obj = *(DataSet().begin() + state.range(0));
+
+ // Set batch size in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = fc_obj.src_shape;
+ TensorShape dst_shape = fc_obj.dst_shape;
+ src_shape.set(src_shape.num_dimensions(), batches);
+ dst_shape.set(dst_shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ weights = create_tensor(fc_obj.weights_shape, dt, 1, fixed_point_position);
+ bias = create_tensor(fc_obj.bias_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ fc_layer = std::unique_ptr<Function>(new Function());
+ fc_layer->configure(&src, &weights, &bias, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ library->fill_tensor_uniform(Accessor(weights), 1);
+ library->fill_tensor_uniform(Accessor(bias), 2);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ fc_layer.reset();
+
+ src.allocator()->free();
+ weights.allocator()->free();
+ bias.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> fc_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType weights{};
+ TensorType bias{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
diff --git a/tests/benchmark/common/NormalizationLayer.h b/tests/benchmark/common/NormalizationLayer.h
new file mode 100644
index 0000000000..4593fb7df3
--- /dev/null
+++ b/tests/benchmark/common/NormalizationLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/NormalizationLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
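+/** Benchmark fixture for normalization layer functions.
+ *
+ * For the dataset entry selected via state.range(0) and the batch size given
+ * by state.range(1) it creates, configures, allocates and fills the source
+ * and destination tensors.
+ */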
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class NormalizationLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const NormalizationLayerDataObject norm_obj = *(DataSet().begin() + state.range(0));
+
+ // Set batch size in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape shape = norm_obj.shape;
+ shape.set(shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(shape, dt, 1, fixed_point_position);
+ dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ norm_layer = std::unique_ptr<Function>(new Function());
+ norm_layer->configure(&src, &dst, norm_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ norm_layer.reset();
+
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> norm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
diff --git a/tests/benchmark/common/PoolingLayer.h b/tests/benchmark/common/PoolingLayer.h
new file mode 100644
index 0000000000..5bb332fd6b
--- /dev/null
+++ b/tests/benchmark/common/PoolingLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/PoolingLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
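+/** Benchmark fixture for pooling layer functions.
+ *
+ * For the dataset entry selected via state.range(0) and the batch size given
+ * by state.range(1) it creates, configures, allocates and fills the source
+ * and destination tensors.
+ */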
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class PoolingLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const PoolingLayerDataObject pool_obj = *(DataSet().begin() + state.range(0));
+
+ // Set batch size in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = pool_obj.src_shape;
+ TensorShape dst_shape = pool_obj.dst_shape;
+ src_shape.set(src_shape.num_dimensions(), batches);
+ dst_shape.set(dst_shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ pool_layer.configure(&src, &dst, pool_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ // Free tensors
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ Function pool_layer{};
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
diff --git a/tests/benchmark/main.cpp b/tests/benchmark/main.cpp
new file mode 100644
index 0000000000..acde259d9b
--- /dev/null
+++ b/tests/benchmark/main.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "PMUCounter.h"
+#include "PerformanceProgramOptions.h"
+#include "PerformanceUserConfiguration.h"
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "WallClockTimer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#ifdef OPENCL
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#endif
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <iostream>
+#include <memory>
+
+using namespace arm_compute::test;
+using namespace arm_compute::test::performance;
+
+namespace arm_compute
+{
+namespace test
+{
+PerformanceUserConfiguration user_config;
+std::unique_ptr<TensorLibrary> library;
+} // namespace test
+} // namespace arm_compute
+
+int main(int argc, char **argv)
+{
+ PerformanceProgramOptions options;
+ try
+ {
+ options.parse_commandline(argc, argv);
+
+ if(options.wants_help())
+ {
+ std::cout << "Usage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ }
+
+ user_config = PerformanceUserConfiguration(options);
+ }
+ catch(const boost::program_options::required_option &err)
+ {
+ std::cerr << "Error: " << err.what() << "\n";
+ std::cout << "\nUsage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ return 1;
+ }
+
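+ // Hand the remaining command line arguments to Google Benchmark so it can consume its own flags (e.g. --benchmark_filter)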
+ ::benchmark::Initialize(&argc, argv);
+
+ if(user_config.seed.is_set())
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get(), user_config.seed);
+ }
+ else
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get());
+ }
+
+#ifdef OPENCL
+ arm_compute::CLScheduler::get().default_init();
+#endif
+
+ std::cout << "Using " << user_config.threads << " CPU " << (user_config.threads == 1 ? "thread" : "threads") << "\n";
+ arm_compute::Scheduler::get().set_num_threads(user_config.threads);
+
+ ::benchmark::RunSpecifiedBenchmarks();
+}
diff --git a/tests/benchmark/system_tests/CL/AlexNet.cpp b/tests/benchmark/system_tests/CL/AlexNet.cpp
new file mode 100644
index 0000000000..fe0b9913de
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/AlexNet.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTest = AlexNetFixture<ICLTensor,
+ CLTensor,
+ CLSubTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLNormalizationLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(AlexNetSystemTest, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTest, cl_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/CL/LeNet5.cpp b/tests/benchmark/system_tests/CL/LeNet5.cpp
new file mode 100644
index 0000000000..d65a7dde6c
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/LeNet5.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<CLTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run LeNet5
+ profiler.start();
+ network.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, cl_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/NEON/AlexNet.cpp b/tests/benchmark/system_tests/NEON/AlexNet.cpp
new file mode 100644
index 0000000000..2d222e7309
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/AlexNet.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTestF32 = AlexNetFixture<ITensor,
+ Tensor,
+ SubTensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NENormalizationLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer,
+ DataType::F32>;
+
+using AlexNetSystemTestQS8 = AlexNetFixture<ITensor,
+ Tensor,
+ SubTensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NENormalizationLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer,
+ DataType::QS8>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(AlexNetSystemTestF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestF32, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
+
+// QS8
+BENCHMARK_DEFINE_F(AlexNetSystemTestQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestQS8, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/NEON/LeNet5.cpp b/tests/benchmark/system_tests/NEON/LeNet5.cpp
new file mode 100644
index 0000000000..5170f05a70
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/LeNet5.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<Tensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run LeNet5
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, neon_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/common/AlexNet.h b/tests/benchmark/system_tests/common/AlexNet.h
new file mode 100644
index 0000000000..9c93dc7228
--- /dev/null
+++ b/tests/benchmark/system_tests/common/AlexNet.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/AlexNet.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
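+/** Benchmark fixture that initialises the weights of, builds, allocates and
+ * randomly fills the AlexNet model object for the batch size given by
+ * state.range(0).
+ */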
+template <typename ITensorType,
+ typename TensorType,
+ typename SubTensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename NormalizationLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction,
+ DataType dt = DataType::F32>
+class AlexNetFixture : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const unsigned int batches = static_cast<unsigned int>(state.range(0));
+ const bool weights_transposed = true;
+
+ network.init_weights(batches, weights_transposed);
+ network.build();
+ network.allocate();
+ network.fill_random();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ network.clear();
+ }
+
+ Profiler profiler{};
+ model_objects::AlexNet<ITensorType,
+ TensorType,
+ SubTensorType,
+ Accessor,
+ ActivationLayerFunction,
+ ConvolutionLayerFunction,
+ FullyConnectedLayerFunction,
+ NormalizationLayerFunction,
+ PoolingLayerFunction,
+ SoftmaxLayerFunction,
+ dt>
+ network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
diff --git a/tests/benchmark/system_tests/common/LeNet5.h b/tests/benchmark/system_tests/common/LeNet5.h
new file mode 100644
index 0000000000..db34f6813a
--- /dev/null
+++ b/tests/benchmark/system_tests/common/LeNet5.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/LeNet5.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
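+/** Benchmark fixture that builds and randomly fills the LeNet5 model object
+ * for the batch size given by state.range(0).
+ */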
+template <typename TensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction>
+class LeNet5Fixture : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ network.build(static_cast<unsigned int>(state.range(0)));
+ network.fill_random();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ network.clear();
+ }
+
+ Profiler profiler{};
+ model_objects::LeNet5<TensorType,
+ Accessor,
+ ActivationLayerFunction,
+ ConvolutionLayerFunction,
+ FullyConnectedLayerFunction,
+ PoolingLayerFunction,
+ SoftmaxLayerFunction>
+ network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__