aboutsummaryrefslogtreecommitdiff
path: root/tests/benchmark/CL
diff options
context:
space:
mode:
Diffstat (limited to 'tests/benchmark/CL')
-rw-r--r--tests/benchmark/CL/ActivationLayer.cpp212
-rw-r--r--tests/benchmark/CL/BitwiseAnd.cpp133
-rw-r--r--tests/benchmark/CL/CMakeLists.txt57
-rw-r--r--tests/benchmark/CL/ConvolutionLayer.cpp277
-rw-r--r--tests/benchmark/CL/FullyConnectedLayer.cpp116
-rw-r--r--tests/benchmark/CL/GEMM.cpp492
-rw-r--r--tests/benchmark/CL/GEMM.h102
-rw-r--r--tests/benchmark/CL/NormalizationLayer.cpp93
-rw-r--r--tests/benchmark/CL/PoolingLayer.cpp141
9 files changed, 1623 insertions, 0 deletions
diff --git a/tests/benchmark/CL/ActivationLayer.cpp b/tests/benchmark/CL/ActivationLayer.cpp
new file mode 100644
index 0000000000..5180d3d900
--- /dev/null
+++ b/tests/benchmark/CL/ActivationLayer.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNet = ActivationLayer<AlexNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerLeNet5 = ActivationLayer<LeNet5ActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ActivationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
diff --git a/tests/benchmark/CL/BitwiseAnd.cpp b/tests/benchmark/CL/BitwiseAnd.cpp
new file mode 100644
index 0000000000..a3deb3eb5b
--- /dev/null
+++ b/tests/benchmark/CL/BitwiseAnd.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+namespace
+{
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ ::benchmark::Fixture::SetUp(state);
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const std::string image_name = *(DataSet().begin() + state.range(0));
+
+ // Create tensors
+ src1 = create_tensor(image_name, DataType::U8);
+ src2 = create_tensor(image_name, DataType::U8);
+ dst = create_tensor(image_name, DataType::U8);
+
+ // Create and configure function
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill source tensors
+ library->fill(CLAccessor(src1), image_name, Channel::R);
+ library->fill(CLAccessor(src2), image_name, Channel::G);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+
+ ::benchmark::Fixture::TearDown(state);
+ }
+
+ CLBitwiseAnd band{};
+ Profiler profiler{};
+
+private:
+ CLTensor src1{};
+ CLTensor src2{};
+ CLTensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, cl_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, cl_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/CL/CMakeLists.txt b/tests/benchmark/CL/CMakeLists.txt
new file mode 100644
index 0000000000..8493309f40
--- /dev/null
+++ b/tests/benchmark/CL/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+include_directories(${CMAKE_SOURCE_DIR}/../include)
+
+set(arm_compute_test_benchmark_TARGET_DEFINITIONS
+ ${arm_compute_test_benchmark_TARGET_DEFINITIONS}
+ -DOPENCL
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_INCLUDES
+ ${arm_compute_test_benchmark_TARGET_INCLUDES}
+ ${CMAKE_SOURCE_DIR}/../include
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_OPENCL_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/CL/CLAccessor.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
+)
+
+add_library(arm_compute_test_benchmark_OPENCL OBJECT
+ ${arm_compute_test_benchmark_OPENCL_SOURCE_FILES}
+)
+
+set(arm_compute_test_benchmark_TARGET_OBJECTS
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_benchmark_OPENCL>
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_LIBRARIES
+ ${arm_compute_test_benchmark_TARGET_LIBRARIES}
+ OpenCL
+ PARENT_SCOPE
+)
diff --git a/tests/benchmark/CL/ConvolutionLayer.cpp b/tests/benchmark/CL/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..e1f4fabdc3
--- /dev/null
+++ b/tests/benchmark/CL/ConvolutionLayer.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerLeNet5 = ConvolutionLayer<LeNet5ConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, CLTensor, CLAccessor, CLConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/CL/FullyConnectedLayer.cpp b/tests/benchmark/CL/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..6e8c89fa0b
--- /dev/null
+++ b/tests/benchmark/CL/FullyConnectedLayer.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNet = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerLeNet5 = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/CL/GEMM.cpp b/tests/benchmark/CL/GEMM.cpp
new file mode 100644
index 0000000000..b90556df48
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.cpp
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/CL/GEMM.h"
+
+namespace
+{
+// File-local fixture aliases binding the two GoogLeNet GEMM datasets to the
+// CL backend, once per element type (F16 and F32).
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+} // namespace
+
+// Benchmark body: times one CLGEMM (F16) configured from GoogLeNetGEMMDataset1.
+// CLScheduler sync ensures the queued kernels have completed before the timer stops.
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// Benchmark body: times one CLGEMM (F16) configured from GoogLeNetGEMMDataset2.
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// F16 registrations: one single-threaded benchmark per GEMM configuration —
+// entries 0-31 of GoogLeNetGEMMDataset1 followed by entries 0-31 of
+// GoogLeNetGEMMDataset2 (the dataset is split in two because of a size limit
+// on the dataset type; DataSetArg selects the entry via state.range(0)).
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+// Benchmark body: times one CLGEMM (F32) configured from GoogLeNetGEMMDataset1.
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// Benchmark body: times one CLGEMM (F32) configured from GoogLeNetGEMMDataset2.
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// F32 registrations: mirror of the F16 list above — entries 0-31 of
+// GoogLeNetGEMMDataset1 followed by entries 0-31 of GoogLeNetGEMMDataset2,
+// all single-threaded.
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/CL/GEMM.h b/tests/benchmark/CL/GEMM.h
new file mode 100644
index 0000000000..02a339609c
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+// FIXME: Merge with NEON/GEMM.h into common/GEMM.h after adding F16 support to NEON GEMM and QS8 support to CL GEMM
+// Google-benchmark fixture that prepares and owns a single GEMM run.
+// Template parameters:
+//   DataSet    - dataset type whose entries provide the A/B/C/D shapes plus alpha/beta
+//   TensorType - backend tensor class (e.g. CLTensor)
+//   Accessor   - host-side accessor for TensorType (not referenced in this fixture;
+//                presumably kept for interface parity with the other benchmark fixtures)
+//   Function   - GEMM function under test (e.g. CLGEMM)
+//   data_type  - element type; SetUp rejects anything other than F16/F32
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+ // Builds the four tensors and configures the GEMM function for the dataset
+ // entry selected by state.range(0) (set via DataSetArg at registration time).
+ void SetUp(::benchmark::State &state) override
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32, "Unsupported data type for GEMM operation");
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+ TensorShape shape_a = gemm_obj.shape_a;
+ TensorShape shape_b = gemm_obj.shape_b;
+ TensorShape shape_c = gemm_obj.shape_c;
+ TensorShape shape_d = gemm_obj.shape_d;
+
+ // Create tensors
+ a = create_tensor(shape_a, data_type);
+ b = create_tensor(shape_b, data_type);
+ c = create_tensor(shape_c, data_type);
+ d = create_tensor(shape_d, data_type);
+
+ // Create and configure function
+ gemm_layer = std::unique_ptr<Function>(new Function());
+ gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+ // Allocate tensors (note: intentionally done after configure())
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ c.allocator()->allocate();
+ d.allocator()->allocate();
+ }
+
+ // Destroys the function, frees tensor memory and forwards the collected
+ // timings to the google-benchmark state.
+ void TearDown(::benchmark::State &state) override
+ {
+ gemm_layer.reset();
+
+ a.allocator()->free();
+ b.allocator()->free();
+ c.allocator()->free();
+ d.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> gemm_layer{ nullptr }; // Function under test; rebuilt per benchmark case
+ Profiler profiler{}; // Gathers wall-clock samples; submitted in TearDown
+
+private:
+ // Input tensors A, B, C and output tensor D of the GEMM operation
+ TensorType a{};
+ TensorType b{};
+ TensorType c{};
+ TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
diff --git a/tests/benchmark/CL/NormalizationLayer.cpp b/tests/benchmark/CL/NormalizationLayer.cpp
new file mode 100644
index 0000000000..81d3c65912
--- /dev/null
+++ b/tests/benchmark/CL/NormalizationLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+// File-local fixture aliases binding the normalization-layer datasets to the CL backend.
+using NormalizationLayerAlexNet = NormalizationLayer<AlexNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+
+} // namespace
+
+// Benchmark body: times CLNormalizationLayer on the AlexNet LRN shapes.
+// Sync before stopping the timer so queued CL kernels are included in the measurement.
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// AlexNet has two normalization layers (dataset entries 0-1); batched over sizes 1, 4, 8.
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+// Benchmark body: times CLNormalizationLayer on the GoogLeNet LRN shapes.
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// GoogLeNet has two normalization layers (dataset entries 0-1); batched over sizes 1, 4, 8.
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/CL/PoolingLayer.cpp b/tests/benchmark/CL/PoolingLayer.cpp
new file mode 100644
index 0000000000..5285f279e7
--- /dev/null
+++ b/tests/benchmark/CL/PoolingLayer.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+// File-local fixture aliases binding the pooling-layer datasets to the CL backend.
+using PoolingLayerAlexNet = PoolingLayer<AlexNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerLeNet5 = PoolingLayer<LeNet5PoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+} // namespace
+
+// Benchmark body: times CLPoolingLayer on the AlexNet pooling shapes.
+// Note: pool_layer is a value member here (fc/gemm fixtures use a unique_ptr).
+BENCHMARK_DEFINE_F(PoolingLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// AlexNet has three pooling layers (dataset entries 0-2); batched over sizes 1, 4, 8.
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+// Benchmark body: times CLPoolingLayer on the LeNet5 pooling shapes.
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// LeNet5 has two pooling layers (dataset entries 0-1); batched over sizes 1, 4, 8.
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+// Benchmark body: times CLPoolingLayer on the GoogLeNet pooling shapes.
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+// GoogLeNet pooling layers (dataset entries 0-9, excluding the unsupported
+// 7x7 pool5 case above); each batched over sizes 1, 4, 8.
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);